Merge pull request #61 from jhuapl-lglenden/develop-updates

Updates from latest development branch.
Authored by Laura Glendenning on 2022-02-04 18:07:08 -05:00; committed via GitHub.
35 changed files with 4033 additions and 1493 deletions

.env

@@ -10,6 +10,7 @@ EVE_DB_VOLUME=eve_db
OPENNLP_ID=5babb6ee4eb7dd2c39b9671c
CORENLP_ID=5babb6ee4eb7dd2c39b9671d
DOCUMENT_CLASSIFIER_ID=5babb6ee4eb7dd2c39b9671b
SIMPLETRANSFORMERS_ID=5babb6ee4eb7dd2c39b96720
EXPOSED_SERVER_TYPE=https
EXPOSED_SERVER_NAME=localhost


@@ -25,6 +25,7 @@ scipy = "~=1.7.1"
tabulate = "~=0.8.9"
multiprocessing-logging = "~=0.3.1"
flask-httpauth = "~=4.4.0"
lxml = "~=4.6.3"
[dev-packages]

backend/Pipfile.lock (generated)

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "3a30a022ccd4fbe028c8cf2c8f741b9c7f7fa72e039dba391da62a20e58c5273"
"sha256": "382dbc37f5349e1a9d22b266891cad743be81ff76fe395c112c157b6a110ed62"
},
"pipfile-spec": 6,
"requires": {
@@ -55,10 +55,10 @@
},
"certifi": {
"hashes": [
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.5.30"
"version": "==2021.10.8"
},
"cffi": {
"hashes": [
@@ -112,39 +112,45 @@
},
"charset-normalizer": {
"hashes": [
"sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
"sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
],
"markers": "python_version >= '3'",
"version": "==2.0.4"
"version": "==2.0.7"
},
"click": {
"hashes": [
"sha256:8c04c11192119b1ef78ea049e0a6f0463e4c48ef00a30160c704337586f3ad7a",
"sha256:fba402a4a47334742d782209a7c79bc448911afe1149d07bdabdf480b3e2f4b6"
"sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3",
"sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"
],
"markers": "python_version >= '3.6'",
"version": "==8.0.1"
"version": "==8.0.3"
},
"cryptography": {
"hashes": [
"sha256:0f1212a66329c80d68aeeb39b8a16d54ef57071bf22ff4e521657b27372e327d",
"sha256:1e056c28420c072c5e3cb36e2b23ee55e260cb04eee08f702e0edfec3fb51959",
"sha256:240f5c21aef0b73f40bb9f78d2caff73186700bf1bc6b94285699aff98cc16c6",
"sha256:26965837447f9c82f1855e0bc8bc4fb910240b6e0d16a664bb722df3b5b06873",
"sha256:37340614f8a5d2fb9aeea67fd159bfe4f5f4ed535b1090ce8ec428b2f15a11f2",
"sha256:3d10de8116d25649631977cb37da6cbdd2d6fa0e0281d014a5b7d337255ca713",
"sha256:3d8427734c781ea5f1b41d6589c293089704d4759e34597dce91014ac125aad1",
"sha256:7ec5d3b029f5fa2b179325908b9cd93db28ab7b85bb6c1db56b10e0b54235177",
"sha256:8e56e16617872b0957d1c9742a3f94b43533447fd78321514abbe7db216aa250",
"sha256:b01fd6f2737816cb1e08ed4807ae194404790eac7ad030b34f2ce72b332f5586",
"sha256:bf40af59ca2465b24e54f671b2de2c59257ddc4f7e5706dbd6930e26823668d3",
"sha256:de4e5f7f68220d92b7637fc99847475b59154b7a1b3868fb7385337af54ac9ca",
"sha256:eb8cc2afe8b05acbd84a43905832ec78e7b3873fb124ca190f574dca7389a87d",
"sha256:ee77aa129f481be46f8d92a1a7db57269a2f23052d5f2433b4621bb457081cc9"
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
],
"markers": "python_version >= '3.6'",
"version": "==3.4.7"
"version": "==35.0.0"
},
"cycler": {
"hashes": [
@@ -155,11 +161,11 @@
},
"flask": {
"hashes": [
"sha256:1c4c257b1892aec1398784c63791cbaa43062f1f7aeb555c4da961b20ee68f55",
"sha256:a6209ca15eb63fc9385f38e452704113d679511d9574d09b2cf9183ae7d20dc9"
"sha256:7b2fb8e934ddd50731893bdcdb00fc8c0315916f9fcd50d22c7cc1a95ab634e2",
"sha256:cb90f62f1d8e4dc4621f52106613488b5ba826b2e1e10a33eac92f723093ab6a"
],
"index": "pypi",
"version": "==2.0.1"
"version": "==2.0.2"
},
"flask-cors": {
"hashes": [
@@ -195,49 +201,115 @@
},
"jinja2": {
"hashes": [
"sha256:1f06f2da51e7b56b8f238affdd6b4e2c61e39598a378cc49345bc1bd42a978a4",
"sha256:703f484b47a6af502e743c9122595cc812b0271f661722403114f71a79d0f5a4"
"sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
"sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.1"
"version": "==3.0.2"
},
"kiwisolver": {
"hashes": [
"sha256:0cd53f403202159b44528498de18f9285b04482bab2a6fc3f5dd8dbb9352e30d",
"sha256:1e1bc12fb773a7b2ffdeb8380609f4f8064777877b2225dec3da711b421fda31",
"sha256:225e2e18f271e0ed8157d7f4518ffbf99b9450fca398d561eb5c4a87d0986dd9",
"sha256:232c9e11fd7ac3a470d65cd67e4359eee155ec57e822e5220322d7b2ac84fbf0",
"sha256:31dfd2ac56edc0ff9ac295193eeaea1c0c923c0355bf948fbd99ed6018010b72",
"sha256:33449715e0101e4d34f64990352bce4095c8bf13bed1b390773fc0a7295967b3",
"sha256:401a2e9afa8588589775fe34fc22d918ae839aaaf0c0e96441c0fdbce6d8ebe6",
"sha256:44a62e24d9b01ba94ae7a4a6c3fb215dc4af1dde817e7498d901e229aaf50e4e",
"sha256:50af681a36b2a1dee1d3c169ade9fdc59207d3c31e522519181e12f1b3ba7000",
"sha256:563c649cfdef27d081c84e72a03b48ea9408c16657500c312575ae9d9f7bc1c3",
"sha256:5989db3b3b34b76c09253deeaf7fbc2707616f130e166996606c284395da3f18",
"sha256:5a7a7dbff17e66fac9142ae2ecafb719393aaee6a3768c9de2fd425c63b53e21",
"sha256:5c3e6455341008a054cccee8c5d24481bcfe1acdbc9add30aa95798e95c65621",
"sha256:5f6ccd3dd0b9739edcf407514016108e2280769c73a85b9e59aa390046dbf08b",
"sha256:72c99e39d005b793fb7d3d4e660aed6b6281b502e8c1eaf8ee8346023c8e03bc",
"sha256:78751b33595f7f9511952e7e60ce858c6d64db2e062afb325985ddbd34b5c131",
"sha256:834ee27348c4aefc20b479335fd422a2c69db55f7d9ab61721ac8cd83eb78882",
"sha256:8be8d84b7d4f2ba4ffff3665bcd0211318aa632395a1a41553250484a871d454",
"sha256:950a199911a8d94683a6b10321f9345d5a3a8433ec58b217ace979e18f16e248",
"sha256:a357fd4f15ee49b4a98b44ec23a34a95f1e00292a139d6015c11f55774ef10de",
"sha256:a53d27d0c2a0ebd07e395e56a1fbdf75ffedc4a05943daf472af163413ce9598",
"sha256:acef3d59d47dd85ecf909c359d0fd2c81ed33bdff70216d3956b463e12c38a54",
"sha256:b38694dcdac990a743aa654037ff1188c7a9801ac3ccc548d3341014bc5ca278",
"sha256:b9edd0110a77fc321ab090aaa1cfcaba1d8499850a12848b81be2222eab648f6",
"sha256:c08e95114951dc2090c4a630c2385bef681cacf12636fb0241accdc6b303fd81",
"sha256:c5518d51a0735b1e6cee1fdce66359f8d2b59c3ca85dc2b0813a8aa86818a030",
"sha256:c8fd0f1ae9d92b42854b2979024d7597685ce4ada367172ed7c09edf2cef9cb8",
"sha256:ca3820eb7f7faf7f0aa88de0e54681bddcb46e485beb844fcecbcd1c8bd01689",
"sha256:cf8b574c7b9aa060c62116d4181f3a1a4e821b2ec5cbfe3775809474113748d4",
"sha256:d3155d828dec1d43283bd24d3d3e0d9c7c350cdfcc0bd06c0ad1209c1bbc36d0",
"sha256:f8d6f8db88049a699817fd9178782867bf22283e3813064302ac59f61d95be05",
"sha256:fd34fbbfbc40628200730bc1febe30631347103fc8d3d4fa012c21ab9c11eca9"
"sha256:0007840186bacfaa0aba4466d5890334ea5938e0bb7e28078a0eb0e63b5b59d5",
"sha256:19554bd8d54cf41139f376753af1a644b63c9ca93f8f72009d50a2080f870f77",
"sha256:1d45d1c74f88b9f41062716c727f78f2a59a5476ecbe74956fafb423c5c87a76",
"sha256:1d819553730d3c2724582124aee8a03c846ec4362ded1034c16fb3ef309264e6",
"sha256:2210f28778c7d2ee13f3c2a20a3a22db889e75f4ec13a21072eabb5693801e84",
"sha256:22521219ca739654a296eea6d4367703558fba16f98688bd8ce65abff36eaa84",
"sha256:25405f88a37c5f5bcba01c6e350086d65e7465fd1caaf986333d2a045045a223",
"sha256:2b65bd35f3e06a47b5c30ea99e0c2b88f72c6476eedaf8cfbc8e66adb5479dcf",
"sha256:2ddb500a2808c100e72c075cbb00bf32e62763c82b6a882d403f01a119e3f402",
"sha256:2f8f6c8f4f1cff93ca5058d6ec5f0efda922ecb3f4c5fb76181f327decff98b8",
"sha256:30fa008c172355c7768159983a7270cb23838c4d7db73d6c0f6b60dde0d432c6",
"sha256:3dbb3cea20b4af4f49f84cffaf45dd5f88e8594d18568e0225e6ad9dec0e7967",
"sha256:4116ba9a58109ed5e4cb315bdcbff9838f3159d099ba5259c7c7fb77f8537492",
"sha256:44e6adf67577dbdfa2d9f06db9fbc5639afefdb5bf2b4dfec25c3a7fbc619536",
"sha256:5326ddfacbe51abf9469fe668944bc2e399181a2158cb5d45e1d40856b2a0589",
"sha256:70adc3658138bc77a36ce769f5f183169bc0a2906a4f61f09673f7181255ac9b",
"sha256:72be6ebb4e92520b9726d7146bc9c9b277513a57a38efcf66db0620aec0097e0",
"sha256:7843b1624d6ccca403a610d1277f7c28ad184c5aa88a1750c1a999754e65b439",
"sha256:7ba5a1041480c6e0a8b11a9544d53562abc2d19220bfa14133e0cdd9967e97af",
"sha256:80efd202108c3a4150e042b269f7c78643420cc232a0a771743bb96b742f838f",
"sha256:82f49c5a79d3839bc8f38cb5f4bfc87e15f04cbafa5fbd12fb32c941cb529cfb",
"sha256:83d2c9db5dfc537d0171e32de160461230eb14663299b7e6d18ca6dca21e4977",
"sha256:8d93a1095f83e908fc253f2fb569c2711414c0bfd451cab580466465b235b470",
"sha256:8dc3d842fa41a33fe83d9f5c66c0cc1f28756530cd89944b63b072281e852031",
"sha256:9661a04ca3c950a8ac8c47f53cbc0b530bce1b52f516a1e87b7736fec24bfff0",
"sha256:a498bcd005e8a3fedd0022bb30ee0ad92728154a8798b703f394484452550507",
"sha256:a7a4cf5bbdc861987a7745aed7a536c6405256853c94abc9f3287c3fa401b174",
"sha256:b5074fb09429f2b7bc82b6fb4be8645dcbac14e592128beeff5461dcde0af09f",
"sha256:b6a5431940f28b6de123de42f0eb47b84a073ee3c3345dc109ad550a3307dd28",
"sha256:ba677bcaff9429fd1bf01648ad0901cea56c0d068df383d5f5856d88221fe75b",
"sha256:bcadb05c3d4794eb9eee1dddf1c24215c92fb7b55a80beae7a60530a91060560",
"sha256:bf7eb45d14fc036514c09554bf983f2a72323254912ed0c3c8e697b62c4c158f",
"sha256:c358721aebd40c243894298f685a19eb0491a5c3e0b923b9f887ef1193ddf829",
"sha256:c4550a359c5157aaf8507e6820d98682872b9100ce7607f8aa070b4b8af6c298",
"sha256:c6572c2dab23c86a14e82c245473d45b4c515314f1f859e92608dcafbd2f19b8",
"sha256:cba430db673c29376135e695c6e2501c44c256a81495da849e85d1793ee975ad",
"sha256:dedc71c8eb9c5096037766390172c34fb86ef048b8e8958b4e484b9e505d66bc",
"sha256:e6f5eb2f53fac7d408a45fbcdeda7224b1cfff64919d0f95473420a931347ae9",
"sha256:ec2eba188c1906b05b9b49ae55aae4efd8150c61ba450e6721f64620c50b59eb",
"sha256:ee040a7de8d295dbd261ef2d6d3192f13e2b08ec4a954de34a6fb8ff6422e24c",
"sha256:eedd3b59190885d1ebdf6c5e0ca56828beb1949b4dfe6e5d0256a461429ac386",
"sha256:f441422bb313ab25de7b3dbfd388e790eceb76ce01a18199ec4944b369017009",
"sha256:f8eb7b6716f5b50e9c06207a14172cf2de201e41912ebe732846c02c830455b9",
"sha256:fc4453705b81d03568d5b808ad8f09c77c47534f6ac2e72e733f9ca4714aa75c"
],
"markers": "python_version >= '3.6'",
"version": "==1.3.1"
"markers": "python_version >= '3.7'",
"version": "==1.3.2"
},
"lxml": {
"hashes": [
"sha256:079f3ae844f38982d156efce585bc540c16a926d4436712cf4baee0cce487a3d",
"sha256:0fbcf5565ac01dff87cbfc0ff323515c823081c5777a9fc7703ff58388c258c3",
"sha256:122fba10466c7bd4178b07dba427aa516286b846b2cbd6f6169141917283aae2",
"sha256:1b38116b6e628118dea5b2186ee6820ab138dbb1e24a13e478490c7db2f326ae",
"sha256:1b7584d421d254ab86d4f0b13ec662a9014397678a7c4265a02a6d7c2b18a75f",
"sha256:26e761ab5b07adf5f555ee82fb4bfc35bf93750499c6c7614bd64d12aaa67927",
"sha256:289e9ca1a9287f08daaf796d96e06cb2bc2958891d7911ac7cae1c5f9e1e0ee3",
"sha256:2a9d50e69aac3ebee695424f7dbd7b8c6d6eb7de2a2eb6b0f6c7db6aa41e02b7",
"sha256:3082c518be8e97324390614dacd041bb1358c882d77108ca1957ba47738d9d59",
"sha256:33bb934a044cf32157c12bfcfbb6649807da20aa92c062ef51903415c704704f",
"sha256:3439c71103ef0e904ea0a1901611863e51f50b5cd5e8654a151740fde5e1cade",
"sha256:36108c73739985979bf302006527cf8a20515ce444ba916281d1c43938b8bb96",
"sha256:39b78571b3b30645ac77b95f7c69d1bffc4cf8c3b157c435a34da72e78c82468",
"sha256:4289728b5e2000a4ad4ab8da6e1db2e093c63c08bdc0414799ee776a3f78da4b",
"sha256:4bff24dfeea62f2e56f5bab929b4428ae6caba2d1eea0c2d6eb618e30a71e6d4",
"sha256:4c61b3a0db43a1607d6264166b230438f85bfed02e8cff20c22e564d0faff354",
"sha256:542d454665a3e277f76954418124d67516c5f88e51a900365ed54a9806122b83",
"sha256:5a0a14e264069c03e46f926be0d8919f4105c1623d620e7ec0e612a2e9bf1c04",
"sha256:5c8c163396cc0df3fd151b927e74f6e4acd67160d6c33304e805b84293351d16",
"sha256:64812391546a18896adaa86c77c59a4998f33c24788cadc35789e55b727a37f4",
"sha256:66e575c62792c3f9ca47cb8b6fab9e35bab91360c783d1606f758761810c9791",
"sha256:6f12e1427285008fd32a6025e38e977d44d6382cf28e7201ed10d6c1698d2a9a",
"sha256:74f7d8d439b18fa4c385f3f5dfd11144bb87c1da034a466c5b5577d23a1d9b51",
"sha256:7610b8c31688f0b1be0ef882889817939490a36d0ee880ea562a4e1399c447a1",
"sha256:76fa7b1362d19f8fbd3e75fe2fb7c79359b0af8747e6f7141c338f0bee2f871a",
"sha256:7728e05c35412ba36d3e9795ae8995e3c86958179c9770e65558ec3fdfd3724f",
"sha256:8157dadbb09a34a6bd95a50690595e1fa0af1a99445e2744110e3dca7831c4ee",
"sha256:820628b7b3135403540202e60551e741f9b6d3304371712521be939470b454ec",
"sha256:884ab9b29feaca361f7f88d811b1eea9bfca36cf3da27768d28ad45c3ee6f969",
"sha256:89b8b22a5ff72d89d48d0e62abb14340d9e99fd637d046c27b8b257a01ffbe28",
"sha256:92e821e43ad382332eade6812e298dc9701c75fe289f2a2d39c7960b43d1e92a",
"sha256:b007cbb845b28db4fb8b6a5cdcbf65bacb16a8bd328b53cbc0698688a68e1caa",
"sha256:bc4313cbeb0e7a416a488d72f9680fffffc645f8a838bd2193809881c67dd106",
"sha256:bccbfc27563652de7dc9bdc595cb25e90b59c5f8e23e806ed0fd623755b6565d",
"sha256:c1a40c06fd5ba37ad39caa0b3144eb3772e813b5fb5b084198a985431c2f1e8d",
"sha256:c47ff7e0a36d4efac9fd692cfa33fbd0636674c102e9e8d9b26e1b93a94e7617",
"sha256:c4f05c5a7c49d2fb70223d0d5bcfbe474cf928310ac9fa6a7c6dddc831d0b1d4",
"sha256:cdaf11d2bd275bf391b5308f86731e5194a21af45fbaaaf1d9e8147b9160ea92",
"sha256:ce256aaa50f6cc9a649c51be3cd4ff142d67295bfc4f490c9134d0f9f6d58ef0",
"sha256:d2e35d7bf1c1ac8c538f88d26b396e73dd81440d59c1ef8522e1ea77b345ede4",
"sha256:d916d31fd85b2f78c76400d625076d9124de3e4bda8b016d25a050cc7d603f24",
"sha256:df7c53783a46febb0e70f6b05df2ba104610f2fb0d27023409734a3ecbb78fb2",
"sha256:e1cbd3f19a61e27e011e02f9600837b921ac661f0c40560eefb366e4e4fb275e",
"sha256:efac139c3f0bf4f0939f9375af4b02c5ad83a622de52d6dfa8e438e8e01d0eb0",
"sha256:efd7a09678fd8b53117f6bae4fa3825e0a22b03ef0a932e070c0bdbb3a35e654",
"sha256:f2380a6376dfa090227b663f9678150ef27543483055cc327555fb592c5967e2",
"sha256:f8380c03e45cf09f8557bdaa41e1fa7c81f3ae22828e1db470ab2a6c96d8bc23",
"sha256:f90ba11136bfdd25cae3951af8da2e95121c9b9b93727b1b896e3fa105b2f586"
],
"index": "pypi",
"version": "==4.6.3"
},
"markupsafe": {
"hashes": [
@@ -387,56 +459,70 @@
},
"pebble": {
"hashes": [
"sha256:556de0f4c65f943b73ba85ab4621f18000864d42a9d562c470ce7bf396d96424",
"sha256:b0abdc8830c21307038d63454584f71c2943e542e4e9d4c86d67aebc06c3519b"
"sha256:46e02767b239a29b8150466514fabb5c6632bea8c9b7456dfdb715f4636fc8a3",
"sha256:694e1105db888f3576b8f00662f90b057cf3780e6f8b7f57955a568008d0f497"
],
"index": "pypi",
"version": "==4.6.1"
"version": "==4.6.3"
},
"pillow": {
"hashes": [
"sha256:0b2efa07f69dc395d95bb9ef3299f4ca29bcb2157dc615bae0b42c3c20668ffc",
"sha256:114f816e4f73f9ec06997b2fde81a92cbf0777c9e8f462005550eed6bae57e63",
"sha256:147bd9e71fb9dcf08357b4d530b5167941e222a6fd21f869c7911bac40b9994d",
"sha256:15a2808e269a1cf2131930183dcc0419bc77bb73eb54285dde2706ac9939fa8e",
"sha256:196560dba4da7a72c5e7085fccc5938ab4075fd37fe8b5468869724109812edd",
"sha256:1c03e24be975e2afe70dfc5da6f187eea0b49a68bb2b69db0f30a61b7031cee4",
"sha256:1fd5066cd343b5db88c048d971994e56b296868766e461b82fa4e22498f34d77",
"sha256:29c9569049d04aaacd690573a0398dbd8e0bf0255684fee512b413c2142ab723",
"sha256:2b6dfa068a8b6137da34a4936f5a816aba0ecc967af2feeb32c4393ddd671cba",
"sha256:2cac53839bfc5cece8fdbe7f084d5e3ee61e1303cccc86511d351adcb9e2c792",
"sha256:2ee77c14a0299d0541d26f3d8500bb57e081233e3fa915fa35abd02c51fa7fae",
"sha256:37730f6e68bdc6a3f02d2079c34c532330d206429f3cee651aab6b66839a9f0e",
"sha256:3f08bd8d785204149b5b33e3b5f0ebbfe2190ea58d1a051c578e29e39bfd2367",
"sha256:479ab11cbd69612acefa8286481f65c5dece2002ffaa4f9db62682379ca3bb77",
"sha256:4bc3c7ef940eeb200ca65bd83005eb3aae8083d47e8fcbf5f0943baa50726856",
"sha256:660a87085925c61a0dcc80efb967512ac34dbb256ff7dd2b9b4ee8dbdab58cf4",
"sha256:67b3666b544b953a2777cb3f5a922e991be73ab32635666ee72e05876b8a92de",
"sha256:70af7d222df0ff81a2da601fab42decb009dc721545ed78549cb96e3a1c5f0c8",
"sha256:75e09042a3b39e0ea61ce37e941221313d51a9c26b8e54e12b3ececccb71718a",
"sha256:8960a8a9f4598974e4c2aeb1bff9bdd5db03ee65fd1fce8adf3223721aa2a636",
"sha256:9364c81b252d8348e9cc0cb63e856b8f7c1b340caba6ee7a7a65c968312f7dab",
"sha256:969cc558cca859cadf24f890fc009e1bce7d7d0386ba7c0478641a60199adf79",
"sha256:9a211b663cf2314edbdb4cf897beeb5c9ee3810d1d53f0e423f06d6ebbf9cd5d",
"sha256:a17ca41f45cf78c2216ebfab03add7cc350c305c38ff34ef4eef66b7d76c5229",
"sha256:a2f381932dca2cf775811a008aa3027671ace723b7a38838045b1aee8669fdcf",
"sha256:a4eef1ff2d62676deabf076f963eda4da34b51bc0517c70239fafed1d5b51500",
"sha256:c088a000dfdd88c184cc7271bfac8c5b82d9efa8637cd2b68183771e3cf56f04",
"sha256:c0e0550a404c69aab1e04ae89cca3e2a042b56ab043f7f729d984bf73ed2a093",
"sha256:c11003197f908878164f0e6da15fce22373ac3fc320cda8c9d16e6bba105b844",
"sha256:c2a5ff58751670292b406b9f06e07ed1446a4b13ffced6b6cab75b857485cbc8",
"sha256:c35d09db702f4185ba22bb33ef1751ad49c266534339a5cebeb5159d364f6f82",
"sha256:c379425c2707078dfb6bfad2430728831d399dc95a7deeb92015eb4c92345eaf",
"sha256:cc866706d56bd3a7dbf8bac8660c6f6462f2f2b8a49add2ba617bc0c54473d83",
"sha256:d0da39795049a9afcaadec532e7b669b5ebbb2a9134576ebcc15dd5bdae33cc0",
"sha256:f156d6ecfc747ee111c167f8faf5f4953761b5e66e91a4e6767e548d0f80129c",
"sha256:f4ebde71785f8bceb39dcd1e7f06bcc5d5c3cf48b9f69ab52636309387b097c8",
"sha256:fc214a6b75d2e0ea7745488da7da3c381f41790812988c7a92345978414fad37",
"sha256:fd7eef578f5b2200d066db1b50c4aa66410786201669fb76d5238b007918fb24",
"sha256:ff04c373477723430dce2e9d024c708a047d44cf17166bf16e604b379bf0ca14"
"sha256:0412516dcc9de9b0a1e0ae25a280015809de8270f134cc2c1e32c4eeb397cf30",
"sha256:04835e68ef12904bc3e1fd002b33eea0779320d4346082bd5b24bec12ad9c3e9",
"sha256:06d1adaa284696785375fa80a6a8eb309be722cf4ef8949518beb34487a3df71",
"sha256:085a90a99404b859a4b6c3daa42afde17cb3ad3115e44a75f0d7b4a32f06a6c9",
"sha256:0b9911ec70731711c3b6ebcde26caea620cbdd9dcb73c67b0730c8817f24711b",
"sha256:10e00f7336780ca7d3653cf3ac26f068fa11b5a96894ea29a64d3dc4b810d630",
"sha256:11c27e74bab423eb3c9232d97553111cc0be81b74b47165f07ebfdd29d825875",
"sha256:11eb7f98165d56042545c9e6db3ce394ed8b45089a67124298f0473b29cb60b2",
"sha256:13654b521fb98abdecec105ea3fb5ba863d1548c9b58831dd5105bb3873569f1",
"sha256:15ccb81a6ffc57ea0137f9f3ac2737ffa1d11f786244d719639df17476d399a7",
"sha256:18a07a683805d32826c09acfce44a90bf474e6a66ce482b1c7fcd3757d588df3",
"sha256:19ec4cfe4b961edc249b0e04b5618666c23a83bc35842dea2bfd5dfa0157f81b",
"sha256:1c3ff00110835bdda2b1e2b07f4a2548a39744bb7de5946dc8e95517c4fb2ca6",
"sha256:27a330bf7014ee034046db43ccbb05c766aa9e70b8d6c5260bfc38d73103b0ba",
"sha256:2b11c9d310a3522b0fd3c35667914271f570576a0e387701f370eb39d45f08a4",
"sha256:2c661542c6f71dfd9dc82d9d29a8386287e82813b0375b3a02983feac69ef864",
"sha256:2cde7a4d3687f21cffdf5bb171172070bb95e02af448c4c8b2f223d783214056",
"sha256:2d5e9dc0bf1b5d9048a94c48d0813b6c96fccfa4ccf276d9c36308840f40c228",
"sha256:2f23b2d3079522fdf3c09de6517f625f7a964f916c956527bed805ac043799b8",
"sha256:35d27687f027ad25a8d0ef45dd5208ef044c588003cdcedf05afb00dbc5c2deb",
"sha256:35d409030bf3bd05fa66fb5fdedc39c521b397f61ad04309c90444e893d05f7d",
"sha256:4326ea1e2722f3dc00ed77c36d3b5354b8fb7399fb59230249ea6d59cbed90da",
"sha256:4abc247b31a98f29e5224f2d31ef15f86a71f79c7f4d2ac345a5d551d6393073",
"sha256:4d89a2e9219a526401015153c0e9dd48319ea6ab9fe3b066a20aa9aee23d9fd3",
"sha256:4e59e99fd680e2b8b11bbd463f3c9450ab799305d5f2bafb74fefba6ac058616",
"sha256:548794f99ff52a73a156771a0402f5e1c35285bd981046a502d7e4793e8facaa",
"sha256:56fd98c8294f57636084f4b076b75f86c57b2a63a8410c0cd172bc93695ee979",
"sha256:59697568a0455764a094585b2551fd76bfd6b959c9f92d4bdec9d0e14616303a",
"sha256:6bff50ba9891be0a004ef48828e012babaaf7da204d81ab9be37480b9020a82b",
"sha256:6cb3dd7f23b044b0737317f892d399f9e2f0b3a02b22b2c692851fb8120d82c6",
"sha256:7dbfbc0020aa1d9bc1b0b8bcf255a7d73f4ad0336f8fd2533fcc54a4ccfb9441",
"sha256:838eb85de6d9307c19c655c726f8d13b8b646f144ca6b3771fa62b711ebf7624",
"sha256:8b68f565a4175e12e68ca900af8910e8fe48aaa48fd3ca853494f384e11c8bcd",
"sha256:8f284dc1695caf71a74f24993b7c7473d77bc760be45f776a2c2f4e04c170550",
"sha256:963ebdc5365d748185fdb06daf2ac758116deecb2277ec5ae98139f93844bc09",
"sha256:a048dad5ed6ad1fad338c02c609b862dfaa921fcd065d747194a6805f91f2196",
"sha256:a1bd983c565f92779be456ece2479840ec39d386007cd4ae83382646293d681b",
"sha256:a66566f8a22561fc1a88dc87606c69b84fa9ce724f99522cf922c801ec68f5c1",
"sha256:bcb04ff12e79b28be6c9988f275e7ab69f01cc2ba319fb3114f87817bb7c74b6",
"sha256:bd24054aaf21e70a51e2a2a5ed1183560d3a69e6f9594a4bfe360a46f94eba83",
"sha256:be25cb93442c6d2f8702c599b51184bd3ccd83adebd08886b682173e09ef0c3f",
"sha256:c691b26283c3a31594683217d746f1dad59a7ae1d4cfc24626d7a064a11197d4",
"sha256:cc9d0dec711c914ed500f1d0d3822868760954dce98dfb0b7382a854aee55d19",
"sha256:ce2e5e04bb86da6187f96d7bab3f93a7877830981b37f0287dd6479e27a10341",
"sha256:ce651ca46d0202c302a535d3047c55a0131a720cf554a578fc1b8a2aff0e7d96",
"sha256:d0c8ebbfd439c37624db98f3877d9ed12c137cadd99dde2d2eae0dab0bbfc355",
"sha256:d675a876b295afa114ca8bf42d7f86b5fb1298e1b6bb9a24405a3f6c8338811c",
"sha256:dde3f3ed8d00c72631bc19cbfff8ad3b6215062a5eed402381ad365f82f0c18c",
"sha256:e5a31c07cea5edbaeb4bdba6f2b87db7d3dc0f446f379d907e51cc70ea375629",
"sha256:f514c2717012859ccb349c97862568fdc0479aad85b0270d6b5a6509dbc142e2",
"sha256:fc0db32f7223b094964e71729c0361f93db43664dd1ec86d3df217853cedda87",
"sha256:fd4fd83aa912d7b89b4b4a1580d30e2a4242f3936882a3f433586e5ab97ed0d5",
"sha256:feb5db446e96bfecfec078b943cc07744cc759893cef045aa8b8b6d6aaa8274e"
],
"markers": "python_version >= '3.6'",
"version": "==8.3.1"
"version": "==8.3.2"
},
"pycparser": {
"hashes": [
@@ -545,11 +631,11 @@
},
"typing-extensions": {
"hashes": [
"sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
"sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
"sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
],
"version": "==3.10.0.0"
"version": "==3.10.0.2"
},
"typing-utils": {
"hashes": [
@@ -561,19 +647,19 @@
},
"urllib3": {
"hashes": [
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.6"
"version": "==1.26.7"
},
"werkzeug": {
"hashes": [
"sha256:1de1db30d010ff1af14a009224ec49ab2329ad2cde454c8a708130642d579c42",
"sha256:6c1ec500dcdba0baa27600f6a22f6333d8b662d22027ff9f6202e3367413caa8"
"sha256:63d3dc1cf60e7b7e35e97fa9861f7397283b75d765afcaefd993d6046899de8f",
"sha256:aa2bb6fc8dee8d6c504c0ac1e7f5f7dc5810a9903e793b6f715a9f015bdadb9a"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.1"
"version": "==2.0.2"
}
},
"develop": {}


@@ -16,6 +16,7 @@ from werkzeug import exceptions
from .. import auth, log, models
from ..data import service
from ..documents import sanitize_document
bp = Blueprint("collections", __name__, url_prefix = "/collections")
LOGGER = logging.getLogger(__name__)
@@ -411,7 +412,9 @@ def get_overlap_ids(collection_id: str):
def _upload_documents(collection, docs):
doc_resp = service.post("/documents", json=docs)
for doc in docs:
sanitize_document(doc)
doc_resp = service.post("documents", json=docs)
# TODO if it failed, roll back the created collection and classifier
if not doc_resp.ok:
abort(doc_resp.status_code, doc_resp.content)


@@ -1,3 +1,3 @@
# (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
from .bp import get_collection_ids_for, get_user_permissions, get_user_permissions_by_id, get_user_permissions_by_ids
from .bp import get_collection_ids_for, get_user_permissions, get_user_permissions_by_id, get_user_permissions_by_ids, sanitize_document


@@ -6,6 +6,7 @@ import re
import typing
from flask import abort, Blueprint, jsonify, request
import lxml.html.clean
from werkzeug import exceptions
from .. import auth, collections, log, models
@@ -13,6 +14,19 @@ from ..data import service
bp = Blueprint("documents", __name__, url_prefix = "/documents")
HTML_CLEANER = lxml.html.clean.Cleaner(
page_structure=True, # keep body only
links=True, # remove <link> (not <a>)
safe_attrs_only=True, # strip out non-standard attributes
style=False, # leave <style>
javascript=True, # no javascript!
scripts=True, # no javascript!!
meta=True, # strip out <meta>
forms=True, # strip out forms
embedded=True, # strip out embedded flash, etc.,
kill_tags=["title"] # otherwise the title gets embedded at the top
)
def _document_user_can_projection():
return service.params({"projection": {
"collection_id": 1
@@ -40,6 +54,9 @@ def get_user_permissions_by_id(document_id: str) -> models.CollectionUserPermiss
def get_user_permissions_by_ids(document_ids: typing.Iterable[str]) -> typing.List[models.CollectionUserPermissions]:
return collections.get_user_permissions_by_ids(get_collection_ids_for(document_ids))
def sanitize_document(document: dict):
if document and "metadata" in document and "html_view" in document["metadata"]:
document["metadata"]["html_view"] = HTML_CLEANER.clean_html(document["metadata"]["html_view"])
@bp.route("/by_id/<doc_id>", methods = ["GET"])
@auth.login_required
@@ -250,6 +267,9 @@ def add_document():
if "has_annotated" not in doc:
doc["has_annotated"] = {user_id: False for user_id in collections_by_id[doc["collection_id"]]["annotators"]}
# sanitize
sanitize_document(doc)
# Add document(s) to database
doc_resp = service.post("documents", json=docs)
if not doc_resp.ok:
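
For reference, a minimal standalone sketch of the sanitization step introduced above. The cleaner settings and the sanitize_document logic mirror the diff; the sample document at the bottom is hypothetical and only illustrates what the cleaner strips from metadata["html_view"].

import lxml.html.clean

HTML_CLEANER = lxml.html.clean.Cleaner(
    page_structure=True,   # keep body only
    links=True,            # remove <link> (not <a>)
    safe_attrs_only=True,  # strip out non-standard attributes
    style=False,           # leave <style>
    javascript=True,       # no javascript!
    scripts=True,          # no javascript!!
    meta=True,             # strip out <meta>
    forms=True,            # strip out forms
    embedded=True,         # strip out embedded flash, etc.
    kill_tags=["title"]    # otherwise the title gets embedded at the top
)

def sanitize_document(document: dict):
    if document and "metadata" in document and "html_view" in document["metadata"]:
        document["metadata"]["html_view"] = HTML_CLEANER.clean_html(document["metadata"]["html_view"])

# Hypothetical document: <script>, the onclick handler, and <title> are removed,
# while plain markup such as <p> survives.
doc = {"metadata": {"html_view": "<html><head><title>t</title></head>"
                                 "<body onclick=\"evil()\"><script>alert(1)</script><p>text</p></body></html>"}}
sanitize_document(doc)
print(doc["metadata"]["html_view"])

With this in place, both the collection bulk upload (_upload_documents) and the single add_document path run every incoming document through the same cleaner before posting it to the data service.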


@@ -91,6 +91,15 @@ class BaseConfig(object):
framework="spacy",
types=["fit", "predict", "status"]
)
),
dict(
name="service_simpletransformers",
version="1.0",
channel="service_simpletransformers",
service=dict(
framework="simpletransformers",
types=["fit", "predict", "status"]
)
)
]


@@ -26,7 +26,7 @@
"src/assets"
],
"styles": [
"src/styles.css",
"src/styles.scss",
"src/themes.scss"
],
"scripts": [


@@ -6936,7 +6936,8 @@
"ini": {
"version": "1.3.8",
"resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz",
"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew=="
"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==",
"dev": true
},
"inquirer": {
"version": "8.1.2",


@@ -204,52 +204,6 @@ mat-expansion-panel-header {
box-shadow: 2px 2px 2px grey;
}
.annotation, .select {
-moz-box-shadow: 2px 2px 2px grey;
-webkit-box-shadow: 2px 2px 2px grey;
box-shadow: 2px 2px 2px grey;
border-top: 1px solid black;
border-bottom: 1px solid black;
}
.select {
background: white !important;
}
.annotationLeft, .selectLeft {
padding-left: 10px;
border-left: 1px solid black;
-moz-border-top-left-radius: 20px;
border-top-left-radius: 20px;
-moz-border-bottom-left-radius: 20px;
border-bottom-left-radius: 20px;
}
.annotationRight, .selectRight {
padding-right: 10px;
margin-right: 2px;
border-right: 1px solid black;
border-top-right-radius: 20px;
border-bottom-right-radius: 20px;
}
.select {
}
.selectLeft {
}
.selectRight {
}
.doc-label-list {
align-items: center;
}


@@ -1,211 +1,217 @@
<!-- (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.-->
<div fxFlexFill class="page-container" fxLayout="column">
<mat-toolbar>
<button class="doc-back-button" mat-icon-button matTooltip="Go back to collection details"
(click)="backToCollectionDetails()">
<mat-icon>keyboard_arrow_left</mat-icon>
</button>
<span class="page-title">Document {{doc?._id}}</span>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('detailsFlag')">Details</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('docAnnotateFlag')">Labeling</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('imageFlag')">Image</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('documentFlag')">Document</button>
<span fxFlex></span>
<button class="annotate-button" mat-raised-button (click)="save(false)">
<span class="material-icons">save</span>Save
</button>
<span fxFlex="10px"></span>
<button mat-raised-button (click)="save(true)">
<span class="material-icons">skip_next</span>
Save and Go to Next Document
</button>
</mat-toolbar>
<mat-toolbar>
<button class="doc-back-button" mat-icon-button matTooltip="Go back to collection details"
(click)="backToCollectionDetails()">
<mat-icon>keyboard_arrow_left</mat-icon>
</button>
<span class="page-title">Document {{doc?._id}}</span>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('detailsFlag')">Details</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('docAnnotateFlag')">Labeling</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('imageFlag')">Image</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('documentFlag')">Document</button>
<span fxFlex></span>
<button class="annotate-button" mat-raised-button (click)="save(false)">
<span class="material-icons">save</span>Save
</button>
<span fxFlex="10px"></span>
<button mat-raised-button (click)="save(true)">
<span class="material-icons">skip_next</span>
Save and Go to Next Document
</button>
</mat-toolbar>
<div class="page-content" id="page-content" #pageContent>
<div class="page-content" id="page-content" #pageContent>
<app-loading></app-loading>
<app-loading></app-loading>
<mat-accordion *ngIf="!loading.loading && !loading.error" [multi]="true" displayMode="flat">
<mat-expansion-panel class="mat-elevation-z0" id="detailsFlag" [expanded]="panelExpanded.detailsFlag"
(closed)="panelIsOpen('detailsFlag', false)" (opened)="panelIsOpen('detailsFlag', true)"
(afterExpand)="onAfterExpand('detailsFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document Details</mat-panel-title>
</mat-expansion-panel-header>
<mat-accordion *ngIf="!loading.loading && !loading.error" [multi]="true" displayMode="flat">
<mat-expansion-panel class="mat-elevation-z0" id="detailsFlag" [expanded]="panelExpanded.detailsFlag"
(closed)="panelIsOpen('detailsFlag', false)" (opened)="panelIsOpen('detailsFlag', true)"
(afterExpand)="onAfterExpand('detailsFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document Details</mat-panel-title>
</mat-expansion-panel-header>
<app-document-details expanded="true" [document]="doc" [collection]="collection"
(imageUrlChanged)="imageChanged($event)">
</app-document-details>
</mat-expansion-panel>
<app-document-details expanded="true" [document]="doc" [collection]="collection"
(imageUrlChanged)="imageChanged($event)">
</app-document-details>
</mat-expansion-panel>
<mat-expansion-panel class="mat-elevation-z0" id="docAnnotateFlag" [expanded]="panelExpanded.docAnnotateFlag"
(closed)="panelIsOpen('docAnnotateFlag', false)" (opened)="panelIsOpen('docAnnotateFlag', true)"
(afterExpand)="onAfterExpand('docAnnotateFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document Labeling</mat-panel-title>
</mat-expansion-panel-header>
<mat-expansion-panel class="mat-elevation-z0" id="docAnnotateFlag"
[expanded]="panelExpanded.docAnnotateFlag" (closed)="panelIsOpen('docAnnotateFlag', false)"
(opened)="panelIsOpen('docAnnotateFlag', true)" (afterExpand)="onAfterExpand('docAnnotateFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document Labeling</mat-panel-title>
</mat-expansion-panel-header>
<div class="doc-labeling-container">
<div fxLayout="row">
<mat-error *ngIf="!permissions.annotate" id="cantAnnotate">
<h3>Note: you do not have authority to change or add annotations for this document.</h3>
</mat-error>
</div>
<div class="doc-labeling-container">
<div fxLayout="row">
<mat-error *ngIf="!permissions.annotate" id="cantAnnotate">
<h3>Note: you do not have authority to change or add annotations for this document.</h3>
</mat-error>
</div>
<div class="doc-label-list" fxLayout="row">
<mat-chip-list fxFlex>
<mat-checkbox *ngFor="let annotation of myDocAnnotations;" [(ngModel)]="annotation.checked"
style="padding-right: 30px">
<mat-chip [style.background-color]="annotation.label.color"
class="shadowed cursor-pointer">
{{annotation.label.name}}</mat-chip>
</mat-checkbox>
</mat-chip-list>
</div>
</div>
</mat-expansion-panel>
<div class="doc-label-list" fxLayout="row">
<mat-chip-list fxFlex>
<mat-checkbox *ngFor="let annotation of myDocAnnotations;" [(ngModel)]="annotation.checked"
style="padding-right: 30px">
<mat-chip [style.background-color]="annotation.label.color"
class="shadowed cursor-pointer">
{{annotation.label.name}}</mat-chip>
</mat-checkbox>
</mat-chip-list>
</div>
</div>
</mat-expansion-panel>
<mat-expansion-panel class="mat-elevation-z0" id="imageFlag" class="no-padding" [expanded]="panelExpanded.imageFlag"
(closed)="panelIsOpen('imageFlag', false)" (opened)="panelIsOpen('imageFlag', true)"
(afterExpand)="onAfterExpand('imageFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Image</mat-panel-title>
</mat-expansion-panel-header>
<mat-expansion-panel class="mat-elevation-z0" id="imageFlag" class="no-padding"
[expanded]="panelExpanded.imageFlag" (closed)="panelIsOpen('imageFlag', false)"
(opened)="panelIsOpen('imageFlag', true)" (afterExpand)="onAfterExpand('imageFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Image</mat-panel-title>
</mat-expansion-panel-header>
<div *ngIf="doc.metadata && doc.metadata['imageUrl']" id="myDocImage" class="image-container"
[ngStyle]="{'height': (pageHeight - 48 - 127) + 'px'}" #imageContainer>
<div
style="position: absolute; top: 0px; bottom: 20px; left: 20px; right: 20px; background-color: lightgray">
<button class="full-screen-btn" mat-raised-button (click)="toggleImageFullscreen()">{{
isImageFullscreen() ? 'Close' : 'Open' }} Full
Screen</button>
<app-image-explorer [imageUrl]="doc.metadata['imageUrl']" [documentId]="doc._id"
[collectionId]="collection._id"></app-image-explorer>
</div>
</div>
</mat-expansion-panel>
<div *ngIf="doc.metadata && doc.metadata['imageUrl']" id="myDocImage" class="image-container"
[ngStyle]="{'height': (pageHeight - 48 - 127) + 'px'}" #imageContainer>
<div
style="position: absolute; top: 0px; bottom: 20px; left: 20px; right: 20px; background-color: lightgray">
<button class="full-screen-btn" mat-raised-button (click)="toggleImageFullscreen()">{{
isImageFullscreen() ? 'Close' : 'Open' }} Full
Screen</button>
<app-image-explorer [imageUrl]="doc.metadata['imageUrl']" [documentId]="doc._id"
[collectionId]="collection._id"></app-image-explorer>
</div>
</div>
</mat-expansion-panel>
<mat-expansion-panel class="mat-elevation-z0" id="documentFlag" class="no-padding" [expanded]="panelExpanded.documentFlag"
(closed)="panelIsOpen('documentFlag', false)" (opened)="panelIsOpen('documentFlag', true)"
(afterExpand)="onAfterExpand('documentFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document</mat-panel-title>
</mat-expansion-panel-header>
<mat-expansion-panel class="mat-elevation-z0" id="documentFlag" class="no-padding"
[expanded]="panelExpanded.documentFlag" (closed)="panelIsOpen('documentFlag', false)"
(opened)="panelIsOpen('documentFlag', true)" (afterExpand)="onAfterExpand('documentFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document</mat-panel-title>
</mat-expansion-panel-header>
<div class="doc-content-container" [ngStyle]="{'height': (pageHeight - 48 - 127) + 'px'}">
<div class="filter-bar">
<button mat-icon-button (click)="showList = !showList">
<mat-icon>list</mat-icon>
</button>
<div class="doc-content-container" [ngStyle]="{'height': (pageHeight - 48 - 127) + 'px'}">
<div class="filter-bar">
<button mat-icon-button (click)="showList = !showList">
<mat-icon>list</mat-icon>
</button>
<span fxFlex="22px"></span>
<span fxFlex="22px"></span>
<span *ngIf="others.length === 0">No annotations from other users.</span>
<div *ngIf="others.length > 0" id="others">
<mat-form-field fxFlex="180px" floatLabel="never">
<mat-label>Show Annotations:</mat-label>
<mat-select id="othersAnnotations" value="" #othersSelect>
<mat-option value="" (click)="showAnnotationsOf(othersSelect, null)">
Mine
</mat-option>
<mat-option *ngFor="let other of others" [value]="other"
(click)="showAnnotationsOf(othersSelect, other)">
{{ auth.getUserDisplayName(other) }}</mat-option>
</mat-select>
</mat-form-field>
<span fxFlex="10px"></span>
<mat-chip-list
*ngIf="othersSelect.value && othersDocAnnotations.hasOwnProperty(othersSelect.value) && othersDocAnnotations[othersSelect.value].length > 0">
<mat-chip *ngFor="let label of othersDocAnnotations[othersSelect.value]"
[style.background-color]="getColorFor(label)">
{{label}}
</mat-chip>
</mat-chip-list>
<span
*ngIf="othersSelect.value && (!othersDocAnnotations.hasOwnProperty(othersSelect.value) || othersDocAnnotations[othersSelect.value].length === 0)">No
labels for this document.</span>
</div>
<span fxFlex></span>
<div>
<span>
<b>
Document Overall Agreement:
</b>
<span *ngIf="ann_agreement != null && ann_agreement != 'null'">{{ann_agreement |
percent:'1.2-2'}}</span>
<span *ngIf="ann_agreement == null || ann_agreement == 'null'">N/A</span>
</span>
</div>
</div>
<span *ngIf="others.length === 0">No annotations from other users.</span>
<div *ngIf="others.length > 0" id="others">
<mat-form-field fxFlex="180px" floatLabel="never">
<mat-label>Show Annotations:</mat-label>
<mat-select id="othersAnnotations" value="" #othersSelect>
<mat-option value="" (click)="showAnnotationsOf(othersSelect, null)">
Mine
</mat-option>
<mat-option *ngFor="let other of others" [value]="other"
(click)="showAnnotationsOf(othersSelect, other)">
{{ auth.getUserDisplayName(other) }}</mat-option>
</mat-select>
</mat-form-field>
<span fxFlex="10px"></span>
<mat-chip-list
*ngIf="othersSelect.value && othersDocAnnotations.hasOwnProperty(othersSelect.value) && othersDocAnnotations[othersSelect.value].length > 0">
<mat-chip *ngFor="let label of othersDocAnnotations[othersSelect.value]"
[style.background-color]="getColorFor(label)">
{{label}}
</mat-chip>
</mat-chip-list>
<span
*ngIf="othersSelect.value && (!othersDocAnnotations.hasOwnProperty(othersSelect.value) || othersDocAnnotations[othersSelect.value].length === 0)">No
labels for this document.</span>
</div>
<span fxFlex></span>
<div>
<span>
<b>
Document Overall Agreement:
</b>
<span *ngIf="ann_agreement != null && ann_agreement != 'null'">{{ann_agreement |
percent:'1.2-2'}}</span>
<span *ngIf="ann_agreement == null || ann_agreement == 'null'">N/A</span>
</span>
</div>
</div>
<div class="annotate-area">
<div *ngIf="showList" class="annotate-table-container" fxFlex="30%">
<app-ner-annotation-table [labels]="availableLabels" [data]="nerData"
(remove)="removeAnnotation($event)" [readOnly]="showingAnnotationsFor !== null">
</app-ner-annotation-table>
</div>
<div class="annotate-doc-container" fxFlex>
<div class="annotate-doc-toolbar" fxLayout="row">
<span class="mat-title">NER Annotations</span>
<span fxFlex></span>
<span *ngIf="showingAnnotationsFor === null">Click to select text; right-click to
annotate
selection</span>
<span *ngIf="showingAnnotationsFor !== null">Showing
{{ auth.getUserDisplayName(showingAnnotationsFor) }}'s
annotations in read-only mode</span>
<span fxFlex="10px"></span>
<mat-menu #settingsMenu="matMenu" id="settings">
<button>
<mat-checkbox matMenuItem [(ngModel)]="settingMonospace"
(click)="$event.stopPropagation()" class="mat-menu-item">
Monospace font
</mat-checkbox>
</button>
</mat-menu>
<button mat-icon-button [matMenuTriggerFor]="settingsMenu" id="settingsButton"
matTooltip="Document/annotation settings">
<mat-icon>settings</mat-icon>
</button>
</div>
<div class="annotate-area">
<div *ngIf="showList" class="annotate-table-container" fxFlex="30%">
<app-ner-annotation-table [labels]="availableLabels" [data]="nerData"
(remove)="removeAnnotation($event)" [readOnly]="showingAnnotationsFor !== null">
</app-ner-annotation-table>
</div>
<div class="annotate-doc-container" fxFlex>
<div class="annotate-doc-toolbar" fxLayout="row">
<span class="mat-title">NER Annotations</span>
<span fxFlex></span>
<span *ngIf="showingAnnotationsFor === null">Click to select text; right-click to
annotate
selection</span>
<span *ngIf="showingAnnotationsFor !== null">Showing
{{ auth.getUserDisplayName(showingAnnotationsFor) }}'s
annotations in read-only mode</span>
<span fxFlex="10px"></span>
<mat-menu #settingsMenu="matMenu" id="settings">
<button>
<mat-checkbox matMenuItem [(ngModel)]="settingMonospace"
(click)="$event.stopPropagation()" class="mat-menu-item">
Monospace font
</mat-checkbox>
</button>
</mat-menu>
<button mat-icon-button [matMenuTriggerFor]="settingsMenu" id="settingsButton"
matTooltip="Document/annotation settings">
<mat-icon>settings</mat-icon>
</button>
</div>
<div #docElem id="doc" class="cursor-pointer">
<!-- set word-start and word-end to help with testing -->
<span #wordsList class="word" *ngFor="let word of nerData.words" [id]="word.id"
[attr.word-start]="word.start" [attr.word-end]="word.end"
[matTooltip]="getWordTooltip(word)" (mousedown)="mousedown($event, word)"
(mouseover)="mouseover($event, word)" (mouseout)="mouseout($event, word)"
(mouseup)="mouseup($event, word)" (click)="click($event, word)"
(contextmenu)="contextMenu($event, word)">{{ word.text }}</span>
</div>
<div #docElem id="doc" class="cursor-pointer">
<span id="words-html">
<div *ngIf="styleHtml" [innerHtml]="styleHtml"></div>
<div *ngIf="contentHtml" [innerHtml]="contentHtml"></div>
</span>
<!-- set word-start and word-end to help with testing -->
<ng-container *ngIf="!contentHtml">
<span #wordsList class="word" *ngFor="let word of nerData.words" [id]="word.id"
[attr.word-start]="word.start" [attr.word-end]="word.end"
[matTooltip]="getWordTooltip(word)" (mousedown)="mousedown($event, word)"
(mouseover)="mouseover($event, word)" (mouseout)="mouseout($event, word)"
(mouseup)="mouseup($event, word)" (click)="click($event, word)"
(contextmenu)="contextMenu($event, word)">{{ word.text }}</span>
</ng-container>
</div>
<div *ngIf="!allowOverlappingNerAnnotations"> (Note: overlapping annotations are not allowed
for
this
collection.)
</div>
<div *ngIf="!allowOverlappingNerAnnotations"> (Note: overlapping annotations are not allowed
for
this
collection.)
</div>
<div #popoverTemplate id="popoverTemplate" class="popover" hidden>
<mat-chip-list>
<mat-chip *ngFor="let label of availableLabels"
[style.background-color]="label.color"
class="shadowed cursor-pointer doc-label-chip">{{label.name}}</mat-chip>
</mat-chip-list>
<div style="padding: 2px">
<button mat-raised-button color="warn">
Remove / Reset
</button>
</div>
</div>
</div>
</div>
</div>
<div #popoverTemplate id="popoverTemplate" class="popover" hidden>
<mat-chip-list>
<mat-chip *ngFor="let label of availableLabels"
[style.background-color]="label.color"
class="shadowed cursor-pointer doc-label-chip">{{label.name}}</mat-chip>
</mat-chip-list>
<div style="padding: 2px">
<button mat-raised-button color="warn">
Remove / Reset
</button>
</div>
</div>
</div>
</div>
</div>
</mat-expansion-panel>
</mat-accordion>
</div>
</mat-expansion-panel>
</mat-accordion>
</div>
</div>


@@ -11,6 +11,7 @@ export class NerData {
public words: Word[];
public annotations: NerAnnotation[];
private wordIndices: object;
private wordMap: { [id: string]: Word } = {};
constructor() {
this.changed = new EventEmitter<NerAnnotation[]>();
@@ -22,7 +23,18 @@ export class NerData {
public setWordsAndAnnotations(words: Word[], annotations: NerAnnotation[]) {
this.words = words;
this.setAnnotations(annotations);
console.log(this.words);
this.wordMap = {};
for(const word of words) {
this.wordMap[word.id] = word;
}
}
public getWordById(id: string) {
return this.wordMap[id];
}
public setAnnotations(annotations: NerAnnotation[]) {
this.annotations = annotations.slice();


@@ -89,6 +89,9 @@ export class NerSelection {
this.words[0].elem.classList.remove("selectLeft");
for(let i = this.words[0].index - 1; i >= word.index; i--) {
const docWord = nerData.words[i];
if(!docWord?.elem) {
continue;
}
docWord.elem.classList.add("select");
if(i === word.index) {
docWord.elem.classList.add("selectLeft");
@@ -101,6 +104,9 @@ export class NerSelection {
this.words[this.words.length - 1].elem.classList.remove("selectRight");
for(let i = this.words[this.words.length - 1].index + 1; i <= word.index; i++) {
const docWord = nerData.words[i];
if(!docWord?.elem) {
continue;
}
docWord.elem.classList.add("select");
if(i === word.index) {
docWord.elem.classList.add("selectRight");


@@ -1,56 +1,62 @@
<!-- (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.-->
<div class="detail-container">
<table class="metadata-table">
<tr class="space-under">
<td><b>Document ID:</b></td>
<td>{{document?._id}}</td>
</tr>
<tr class="space-under">
<td><b>Creation Date:</b></td>
<td>{{document?._created}}</td>
</tr>
<tr class="space-under">
<td><b>Last Updated:</b></td>
<td>{{document?._updated}}</td>
</tr>
<tr class="space-under">
<td><b>Creator:</b></td>
<td>{{auth.getUserDisplayName(document?.creator_id)}}</td>
</tr>
<tr class="space-under">
<td><b>Metadata:</b></td>
<td>
<table>
<ng-container *ngIf="document && document?.metadata">
<tr *ngFor="let item of document?.metadata | keyvalue">
<td><b>{{item.key}}</b></td>
<td *ngIf="item.key !== 'imageUrl'">{{item.value}}</td>
<td *ngIf="item.key === 'imageUrl'">
<a [href]="collections.collectionImageUrl(collection._id, item.value)" target="_blank">
{{item.value}}
<span
*ngIf="item.value !== collections.collectionImageUrl(collection._id, item.value)">({{collections.collectionImageUrl(collection._id, item.value)}})</span>
</a>
<div><button *ngIf="permissions.modify_document_metadata" mat-button mat-raised-button
(click)="updateImage()">Update document image</button></div>
</td>
</tr>
</ng-container>
<tr
*ngIf="permissions.modify_document_metadata && (!document || !document.metadata || !document.metadata.hasOwnProperty('imageUrl'))">
<td><b>imageUrl</b></td>
<td><button mat-button mat-raised-button (click)="updateImage()">Update document
image</button></td>
</tr>
</table>
</td>
</tr>
<tr>
<td><b>Collection:</b></td>
<td *ngIf="!collection">Loading...</td>
<td *ngIf="collection">{{ collection.hasTitle() ? collection.getTitle() + " (" : "" }}<a href="#"
[routerLink]="['/' + PATHS.collection.details, document?.collection_id]">{{document?.collection_id}}</a>{{ collection.hasTitle() ? ")" : "" }}
</td>
</tr>
</table>
<table class="metadata-table">
<tr class="space-under">
<td><b>Document ID:</b></td>
<td>{{document?._id}}</td>
</tr>
<tr class="space-under">
<td><b>Creation Date:</b></td>
<td>{{document?._created}}</td>
</tr>
<tr class="space-under">
<td><b>Last Updated:</b></td>
<td>{{document?._updated}}</td>
</tr>
<tr class="space-under">
<td><b>Creator:</b></td>
<td>{{auth.getUserDisplayName(document?.creator_id)}}</td>
</tr>
<tr class="space-under">
<td><b>Metadata:</b></td>
<td>
<table>
<ng-container *ngIf="document && document?.metadata">
<tr *ngFor="let item of document?.metadata | keyvalue">
<ng-container *ngIf="item.key != 'html_view'">
<td><b>{{item.key}}</b></td>
<td *ngIf="item.key !== 'imageUrl'">{{item.value}}</td>
<td *ngIf="item.key === 'imageUrl'">
<a [href]="collections.collectionImageUrl(collection._id, item.value)"
target="_blank">
{{item.value}}
<span
*ngIf="item.value !== collections.collectionImageUrl(collection._id, item.value)">({{collections.collectionImageUrl(collection._id,
item.value)}})</span>
</a>
<div><button *ngIf="permissions.modify_document_metadata" mat-button
mat-raised-button (click)="updateImage()">Update document image</button>
</div>
</td>
</ng-container>
</tr>
</ng-container>
<tr
*ngIf="permissions.modify_document_metadata && (!document || !document.metadata || !document.metadata.hasOwnProperty('imageUrl'))">
<td><b>imageUrl</b></td>
<td><button mat-button mat-raised-button (click)="updateImage()">Update document
image</button></td>
</tr>
</table>
</td>
</tr>
<tr>
<td><b>Collection:</b></td>
<td *ngIf="!collection">Loading...</td>
<td *ngIf="collection">{{ collection.hasTitle() ? collection.getTitle() + " (" : "" }}<a href="#"
[routerLink]="['/' + PATHS.collection.details, document?.collection_id]">{{document?.collection_id}}</a>{{
collection.hasTitle() ? ")" : "" }}
</td>
</tr>
</table>
</div>


@@ -1,6 +1,6 @@
/*(C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC. */
import { Component, OnInit, ViewChild, Input, Output, EventEmitter } from '@angular/core';
import { Component, OnInit, ViewChild, Input, Output, EventEmitter, ChangeDetectorRef } from '@angular/core';
import { MatPaginator } from '@angular/material/paginator';
import { MatSort, MatSortable } from '@angular/material/sort';
import { MatTable, MatTableDataSource } from '@angular/material/table';
@@ -44,7 +44,9 @@ export class NERAnnotationTableComponent implements OnInit {
public dataSource: MatTableDataSource<NerAnnotation>;
constructor() {
constructor(
private cdr: ChangeDetectorRef
) {
this.dataSource = new MatTableDataSource<NerAnnotation>();
this.dataSource.filterPredicate = (annotation, value): boolean => {
if(annotation.label.toLowerCase().includes(value)) {
@@ -60,6 +62,7 @@ export class NERAnnotationTableComponent implements OnInit {
ngOnInit() {
this.data.changed.subscribe((res: NerAnnotation[]) => {
this.dataSource.data = res;
this.cdr.detectChanges();
});
this.dataSource.sortingDataAccessor = (annotation: NerAnnotation, property: string) => {
switch(property) {


@@ -1,5 +1,6 @@
// (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
import * as _ from "lodash";
import { Observable } from "rxjs";
export class Word {
@@ -59,5 +60,19 @@ export class Word {
return words;
}
public static parseWordObjectsFromHtml(elems: HTMLElement[]): Word[] {
const words = [];
_.forEach(elems, (elem: HTMLElement) => {
console.log(elem);
let id = elem.getAttribute('ID');
let parts = id.split('_');
let start = parts[1];
//let end = parts[2];
let wordObj = new Word(+start, elem.innerHTML, words.length);
words.push(wordObj);
});
return words;
}
}


@@ -58,3 +58,39 @@ td.space-left {
.spacer {
flex: 1 1 auto;
}
.annotate-area {
.annotation, .select {
-moz-box-shadow: 2px 2px 2px grey;
-webkit-box-shadow: 2px 2px 2px grey;
box-shadow: 2px 2px 2px grey;
border-top: 1px solid black;
border-bottom: 1px solid black;
}
.select {
background: rgba(255,255,255,0.4) !important;
}
.annotationLeft, .selectLeft {
padding-left: 10px;
border-left: 1px solid black;
-moz-border-top-left-radius: 20px;
border-top-left-radius: 20px;
-moz-border-bottom-left-radius: 20px;
border-bottom-left-radius: 20px;
}
.annotationRight, .selectRight {
padding-right: 10px;
margin-right: 2px;
border-right: 1px solid black;
border-top-right-radius: 20px;
border-bottom-right-radius: 20px;
}
}


@@ -76,8 +76,9 @@ WORKDIR ${ROOT_DIR}
# pipenv causing container to fail to rebuild if spacy installed previously
#Install python requirements
COPY Pipfile Pipfile.lock ./
RUN REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt pipenv install --dev --system --deploy
RUN REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt PIPENV_INSTALL_TIMEOUT=30 \
pipenv install --dev --system --deploy
RUN python3 -m nltk.downloader punkt
#Copy contents of pipeline folder to docker
COPY pine ./pine


@@ -22,6 +22,10 @@ scikit-multilearn = "~=0.2.0"
python-json-logger = "~=2.0.2"
overrides = "~=6.1.0"
typing-extensions = "~=3.10.0.0"
pandas = "~=1.3.3"
simpletransformers = "~=0.61.13"
torch = {file = "https://download.pytorch.org/whl/cpu/torch-1.9.0%2Bcpu-cp38-cp38-linux_x86_64.whl"}
nltk = "~=3.6.7"
[requires]
python_version = "3.8"

pipelines/Pipfile.lock (generated)

File diff suppressed because it is too large.


@@ -9,8 +9,10 @@ fi
set -x
pipenv run python3 -m nltk.downloader punkt
PIDS=""
for SERVICE in opennlp corenlp spacy; do
for SERVICE in simpletransformers opennlp corenlp spacy; do
AL_PIPELINE=${SERVICE} pipenv run python3 -m pine.pipelines.run_service &
PIDS="${PIDS} $!"
done


@@ -10,6 +10,15 @@ from .shared.config import ConfigBuilder
logger = logging.getLogger(__name__)
config = ConfigBuilder.get_config()
class EveDocsAndAnnotations:
def __init__(self):
self.all_labels: typing.List[str] = []
self.documents: typing.List[str] = []
self.annotations: typing.List = []
self.doc_ids: typing.List[str] = []
self.ann_ids: typing.List[str] = []
class EveClient(object):
eve_headers = {'Content-Type': 'application/json'}
@@ -105,7 +114,7 @@ class EveClient(object):
}
return self._get_documents_map(params)
def get_docs_with_annotations(self, collection_id: str, doc_map: typing.Dict[str, str]) -> typing.Tuple[typing.List[str], typing.List[str], typing.List[str], typing.List[str]]:
def get_docs_with_annotations(self, collection_id: str, doc_map: typing.Dict[str, str]) -> EveDocsAndAnnotations:
"""Gets document and annotation data. Only non-overlapping documents are returned.
:param collection_id: str: the ID of the collection
@@ -116,10 +125,11 @@ class EveClient(object):
ann_ids is a list of the annotation IDs
:rtype: tuple
"""
doc_ids = list()
documents = []
ann_ids = list()
labels = []
data = EveDocsAndAnnotations()
# get all labels from collection object
collection = self.get_obj("collections", collection_id)
data.all_labels = collection["labels"]
#get annotations and make data
query = 'annotations?where={"collection_id":"%s"}' % (collection_id)
@@ -132,15 +142,15 @@ class EveClient(object):
# remove overlaps
if docid not in doc_map:
continue
doc_ids.append(docid)
documents.append(doc_map[docid])
ann_ids.append(a["_id"])
labels.append(a["annotation"])
data.doc_ids.append(docid)
data.documents.append(doc_map[docid])
data.ann_ids.append(a["_id"])
data.annotations.append(a["annotation"])
if query is None:
break
return documents, labels, doc_ids, ann_ids
return data
def update(self, resource, id, etag, update_obj):
headers = {'Content-Type': 'application/json', 'If-Match': etag}
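
Instead of the old four-element tuple, get_docs_with_annotations now returns a single EveDocsAndAnnotations container. The sketch below shows the shape callers now work with; the values (and the annotation payload format) are hypothetical, while the field names come from the class added above.

import typing

class EveDocsAndAnnotations:
    # Field names match the class added in this diff.
    def __init__(self):
        self.all_labels: typing.List[str] = []
        self.documents: typing.List[str] = []
        self.annotations: typing.List = []
        self.doc_ids: typing.List[str] = []
        self.ann_ids: typing.List[str] = []

# Hypothetical result, shaped the way get_docs_with_annotations now fills it in.
data = EveDocsAndAnnotations()
data.all_labels = ["PER", "LOC"]               # every label defined on the collection
data.doc_ids = ["doc1"]
data.documents = ["John lives in Baltimore"]
data.ann_ids = ["ann1"]
data.annotations = [[(0, 4, "PER"), (14, 23, "LOC")]]  # illustrative payload only

# Callers that previously unpacked
#   documents, labels, doc_ids, ann_ids = client.get_docs_with_annotations(...)
# now read named fields, with data.all_labels feeding the new fit/evaluate
# signatures in ner_api.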


@@ -12,14 +12,31 @@ from skmultilearn.model_selection import IterativeStratification
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import chain
from .EveClient import EveClient
from .EveClient import EveClient, EveDocsAndAnnotations
from . import RankingFunctions as rank
from .pipeline import EvaluationMetrics, StatMetrics
from .pmap_ner import NER
from .shared.config import ConfigBuilder
logger = logging.getLogger(__name__)
config = ConfigBuilder.get_config()
class FiveFoldResult(object):
def __init__(self):
self.metrics: typing.List[EvaluationMetrics] = []
# store list of documents ids per fold
self.folds: typing.List[typing.List] = []
self.average_metrics: typing.Dict[str, StatMetrics] = {}
def serialize_metrics(self):
return [x.serialize() for x in self.metrics]
def serialize_folds(self):
return list(self.folds) # make a copy
def serialize_average_metrics(self):
return {label: self.average_metrics[label].serialize() for label in self.average_metrics.keys()}
class ner_api(object):
@@ -43,16 +60,14 @@ class ner_api(object):
status["has_trained"] = "filename" in classifier_obj
return status
def perform_fold(self, model: NER, train_data, test_data, **pipeline_parameters):
model.fit(train_data[0], train_data[1], **pipeline_parameters)
results = model.evaluate(test_data[0], test_data[1], range(0, len(test_data[0])))
def perform_fold(self, model: NER, all_labels: typing.List[str], train_data, test_data, **pipeline_parameters) -> EvaluationMetrics:
model.fit(train_data[0], train_data[1], all_labels, **pipeline_parameters)
results = model.evaluate(test_data[0], test_data[1], all_labels)
return results
def perform_five_fold(self, model: NER, documents, annotations, doc_ids, **pipeline_parameters):
metrics = list()
# store list of documents ids per fold
folds = list()
def perform_five_fold(self, model: NER, all_labels: typing.List[str], documents, annotations, doc_ids: typing.List[str], **pipeline_parameters) -> FiveFoldResult:
results = FiveFoldResult()
# turning into numpy arrays to be able to access values with index array
documents_np_array = np.array(documents)
annotations_np_array = np.array(annotations, dtype=object)
@@ -84,51 +99,39 @@ class ner_api(object):
train_documents = documents_np_array[train_index]
test_documents = documents_np_array[test_index]
fold_metrics = self.perform_fold(model, [train_documents.tolist(), train_annotations.tolist()],
[test_documents.tolist(), test_annotations.tolist()], **pipeline_parameters)
fold_metrics = self.perform_fold(model, all_labels,
[train_documents.tolist(), train_annotations.tolist()],
[test_documents.tolist(), test_annotations.tolist()],
**pipeline_parameters)
# saving docs used to train fold
fold_doc_ids = doc_ids_np_array[train_index]
folds.append(fold_doc_ids.tolist())
results.folds.append(fold_doc_ids.tolist())
# saving fold metrics
metrics.append(fold_metrics)
results.metrics.append(fold_metrics)
for key in fold_metrics.keys():
for key in fold_metrics.labels.keys():
if key not in total_metrics:
total_metrics[key] = {"FN": 0, "FP": 0, "TP": 0, "TN": 0, "f1": 0, "precision": 0, "recall": 0, "acc": 0}
total_metrics[key]["FN"] = total_metrics[key]["FN"] + fold_metrics[key]["FN"]
total_metrics[key]["FP"] = total_metrics[key]["FP"] + fold_metrics[key]["FP"]
total_metrics[key]["TP"] = total_metrics[key]["TP"] + fold_metrics[key]["TP"]
total_metrics[key]["TN"] = total_metrics[key]["TN"] + fold_metrics[key]["TN"]
total_metrics[key] = StatMetrics()
total_metrics[key].fn += fold_metrics.labels[key].fn
total_metrics[key].fp += fold_metrics.labels[key].fp
total_metrics[key].tp += fold_metrics.labels[key].tp
total_metrics[key].tn += fold_metrics.labels[key].tn
average_metrics = {}
for label in total_metrics.keys():
avg_metric = {}
avg_metric["FN"] = total_metrics[label]["FN"] / 5
avg_metric["FP"] = total_metrics[label]["FP"] / 5
avg_metric["TP"] = total_metrics[label]["TP"] / 5
avg_metric["TN"] = total_metrics[label]["TN"] / 5
if (avg_metric["TP"] + avg_metric["FN"]) != 0:
avg_metric["recall"] = avg_metric["TP"] / (avg_metric["TP"] + avg_metric["FN"])
else:
avg_metric["recall"] = 1.0
if (avg_metric["TP"] + avg_metric["FP"]) != 0:
avg_metric["precision"] = avg_metric["TP"] / (avg_metric["TP"] + avg_metric["FP"])
else:
avg_metric["precision"] = 0.0
if (avg_metric["precision"] + avg_metric["recall"]) != 0:
avg_metric["f1"] = 2 * (avg_metric["precision"] * avg_metric["recall"]) / (avg_metric["precision"] + avg_metric["recall"])
else:
avg_metric["f1"] = 0
avg_metric["acc"] = (avg_metric["TP"] + avg_metric["TN"]) / (avg_metric["TP"] + avg_metric["TN"] + avg_metric["FP"] + avg_metric["FN"])
avg_metric = StatMetrics()
avg_metric.fn = total_metrics[label].fn / 5
avg_metric.fp = total_metrics[label].fp / 5
avg_metric.tp = total_metrics[label].tp / 5
avg_metric.tn = total_metrics[label].tn / 5
avg_metric.calc_precision_recall_f1_acc()
average_metrics[label] = avg_metric
results.average_metrics[label] = avg_metric
return metrics, folds, average_metrics
return results
def get_document_ranking(self, model: NER, doc_map: typing.Dict[str, str], doc_ids: typing.List[str]) -> typing.List[str]:
"""Calculates document rankings and returns document IDs sorted by ranking.
@@ -189,19 +192,21 @@ class ner_api(object):
# get documents where overlap is 0
doc_map = self.eve_client.get_documents(collection_id)
# get documents with its annotations where overlap is 0
documents, labels, doc_ids, ann_ids = self.eve_client.get_docs_with_annotations(collection_id, doc_map)
eve_data = self.eve_client.get_docs_with_annotations(collection_id, doc_map)
# instantiate model
classifier = NER(pipeline_name)
# get folds information
metrics, folds, averages = self.perform_five_fold(classifier, documents, labels, doc_ids, **pipeline_parameters)
fold_results = self.perform_five_fold(classifier, eve_data.all_labels,
eve_data.documents, eve_data.annotations,
eve_data.doc_ids, **pipeline_parameters)
logger.info("Starting to train classifier for {} pipeline".format(pipeline_name))
fit_results = classifier.fit(documents, labels, **pipeline_parameters)
fit_results = classifier.fit(eve_data.documents, eve_data.annotations, eve_data.all_labels, **pipeline_parameters)
results = {
"fit": fit_results,
"average_metrics": averages,
"average_metrics": fold_results.serialize_average_metrics(),
"updated_objects": {}
}
@@ -221,11 +226,11 @@ class ner_api(object):
# update classifier metrics on eve
metrics_updated_obj = {
'trained_classifier_db_version': classifier_obj['_version']+1,
'documents': list(set(chain.from_iterable(folds))),
'annotations': list(ann_ids),
'folds': list(folds),
'metrics': list(metrics),
'metric_averages': dict(averages),
'documents': list(set(chain.from_iterable(fold_results.folds))),
'annotations': list(eve_data.ann_ids),
'folds': fold_results.serialize_folds(),
'metrics': fold_results.serialize_metrics(),
'metric_averages': fold_results.serialize_average_metrics(),
'filename': filename
}
if not self.eve_client.update('metrics', metrics_obj["_id"], metrics_obj['_etag'], metrics_updated_obj):
@@ -234,7 +239,7 @@ class ner_api(object):
results["updated_objects"]["metrics"] = [metrics_obj["_id"]]
# re rank documents
ranks = self.get_document_ranking(classifier, doc_map, doc_ids)
ranks = self.get_document_ranking(classifier, doc_map, eve_data.doc_ids)
logger.info("Performing document rankings")
# Save updates to eve
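
To make the five-fold averaging in this file concrete, here is a small sketch (with made-up fold totals) of how the summed counts become the reported per-label averages via StatMetrics; the import path is assumed from the package layout used by run_service:

from pine.pipelines.pipeline import StatMetrics  # assumed import path

# Made-up totals summed over the five folds for one label: 40 TP, 10 FP, 5 FN, 445 TN.
avg_metric = StatMetrics()
avg_metric.tp, avg_metric.fp, avg_metric.fn, avg_metric.tn = 40 / 5, 10 / 5, 5 / 5, 445 / 5
avg_metric.calc_precision_recall_f1_acc()
# precision = 8 / (8 + 2) = 0.8, recall = 8 / (8 + 1) ~= 0.889,
# f1 = 2 * 0.8 * 0.889 / (0.8 + 0.889) ~= 0.842, acc = (8 + 89) / 100 = 0.97
print(avg_metric.serialize())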

View File

@@ -11,7 +11,7 @@ import uuid
from overrides import overrides
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities, EvaluationMetrics, StatMetrics
from .shared.config import ConfigBuilder
config = ConfigBuilder.get_config()
@@ -161,7 +161,7 @@ class corenlp_NER(Pipeline):
}
@overrides
def fit(self, X, y, **params) -> dict:
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
default_params = self.__default_fit_params.copy()
#format input data into tsv file for ner to train on
try:
@@ -303,7 +303,7 @@ wordShape=""" + default_params["word_shape"] + """
@overrides
#TODO
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
return
## EXTRA METHODS TO HELP WITH THE corenlp PIPELINE ##
@@ -313,7 +313,7 @@ wordShape=""" + default_params["word_shape"] + """
#Takes input data and formats it to be easier to use in the corenlp pipeline
#ASSUMES DATA FOLLOWS FORMAT X = [string], y = [[(start offset, stop offset, label), ()], ... []]
#Currently cannot assign more than one label to the same word
def format_data(self, X, y):
def format_data(self, X: typing.Iterable[str], y):
out = []
for doc,ann in zip(X,y):
#Extract labeled entities from doc
@@ -352,7 +352,7 @@ wordShape=""" + default_params["word_shape"] + """
@overrides
#models must be saved with extension ".ser.gz"
def save_model(self, model_name):
def save_model(self, model_name: str):
if not model_name.endswith(".ser.gz"):
logger.warn('WARNING: model_name must end in .ser.gz, adding...')
model_name = model_name + ".ser.gz"
@@ -363,7 +363,7 @@ wordShape=""" + default_params["word_shape"] + """
@overrides
#properties can be exported/imported during train
def load_model(self, model_name):
def load_model(self, model_name: str):
#TODO: what to do if model doesn't exist?
if not model_name.endswith(".ser.gz"):
logger.warn('WARNING: model_name must end in .ser.gz, adding...')
@@ -390,31 +390,31 @@ wordShape=""" + default_params["word_shape"] + """
#Calculates Precision, Recall, and F1 Score for model based on input test data
#WARNING: currently works for BioNLP data, no guarantees with other datasets
def evaluate(self, X, y, Xid, verbose=False):
known_labels = set()
for anns in y:
for ann in anns:
known_labels.add(ann[2])
stats = {}
# WARNING: this is currently broken, but this whole pipeline is broken
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], verbose=False, **kwargs) -> EvaluationMetrics:
try:
train_data = self.format_data(X, y)
if len(train_data) == 0 or train_data is None:
raise Exception("ERROR: could not format input correctly")
except:
raise Exception("ERROR: could not format input correctly")
known_labels = set()
for anns in y:
for ann in anns:
known_labels.add(ann[2])
metrics = EvaluationMetrics()
test_text = ''
for doc in X:
test_text = test_text + doc + '\n\n'
#rest of code tries to recreate calculations as this line, which can't be called more than once for some reason
#results = self.__crf.classifyAndWriteAnswers(self.__java_String(self.__test_file), True)
#print(test_text)
results = self.__crf.classify(self.__java_String(test_text))
#Calculate evaluation by iterating through answer key and matching tokens to classifier output
s = 0
w = 0
@@ -474,7 +474,7 @@ wordShape=""" + default_params["word_shape"] + """
#(likely the current answer token doesn't exactly match the guess token, see `` vs '')
if i+1 < len(doc):
next_gold = doc[i+1]
elif i >= len(doc) and d+1 < len(test_data):
elif i >= len(doc) and d+1 < len(test_data): # this is broken
next_gold = test_data[d+1][0]
else:
next_gold = (None, None)
@@ -488,32 +488,30 @@ wordShape=""" + default_params["word_shape"] + """
known_labels.add(pred)
# Per token metriccs
# Per token metrics
for label in known_labels:
if label not in stats:
stats[label] = [0, 0, 0, 0]
if label not in metrics.labels:
metrics.labels[label] = StatMetrics()
if gold == pred and gold != 'O':
stats[gold][0] = stats[gold][0] + 1
metrics.labels[gold].tp += 1
for label in known_labels:
if label != gold:
stats[label][3] = stats[label][3] + 1
metrics.labels[label].tn += 1
elif gold == 'O' and pred != 'O':
stats[pred][1] = stats[pred][1] + 1
metrics.labels[pred].fp += 1
for label in known_labels:
if label != pred:
stats[label][3] = stats[label][3] + 1
metrics.labels[label].tn += 1
elif pred == 'O' and gold != 'O':
stats[gold][2] = stats[gold][2] + 1
metrics.labels[gold].fn += 1
for label in known_labels:
if label != gold:
stats[label][3] = stats[label][3] + 1
metrics.labels[label].tn += 1
else:
for label in known_labels:
stats[label][3] = stats[label][3] + 1
metrics.labels[label].tn += 1
# Per annotation metrics
@@ -555,54 +553,22 @@ wordShape=""" + default_params["word_shape"] + """
#ONLY USED FOR PER ANNOTATION METRICS
# del stats['O']
TP = 0
TN = 0
FP = 0
FN = 0
for key in stats:
TP = TP + stats[key][0]
FP = FP + stats[key][1]
FN = FN + stats[key][2]
TN = TN + stats[key][3]
stats['Totals'] = [TP, FP, FN, TN]
for key in metrics.labels:
metrics.totals.tp += metrics.labels[key].tp
metrics.totals.fp += metrics.labels[key].fp
metrics.totals.fn += metrics.labels[key].fn
metrics.totals.tn += metrics.labels[key].tn
#print(test_data[-1])
for key in stats:
TP = stats[key][0]
FP = stats[key][1]
FN = stats[key][2]
# Only generated when using per token metrics
TN = stats[key][3]
if (TP+FN) != 0:
recall = TP/(TP+FN)
else:
recall = 1.0
if (TP+FP) != 0:
precision = TP/(TP+FP)
else:
precision = 0.0
if (precision + recall) != 0:
f1 = 2 * (precision * recall) / (precision + recall)
else:
f1 = 0
# Acc Only works when using per token metrics which generates TN
if (TP + FN + FP + TN) != 0:
acc = (TP + TN) / (TP + FN + FP + TN)
else:
acc = 0
#Used for annotation metrics
# stats[key] = {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN}
# Used for token metrics
stats[key] = {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN, 'acc': acc}
metrics.calc_precision_recall_f1_acc()
return stats
return metrics
#Calculates Precision, Recall, and F1 Score for model based on input test data
#TODO: prints a whole lot to the command line, find a way to suppress?
def evaluate_orig(self, X, y, Xid):
def evaluate_orig(self, X: typing.Iterable[str], y, Xid):
try:
test_data = self.format_data(X, y)
if len(test_data) == 0 or test_data is None:

View File

@@ -13,7 +13,7 @@ import typing
import pydash
from overrides import overrides
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities, EvaluationMetrics, StatMetrics
from .shared.config import ConfigBuilder
config = ConfigBuilder.get_config()
@@ -148,14 +148,14 @@ class opennlp_NER(Pipeline):
}
@overrides
def fit(self, X, y, **params) -> dict:
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
try:
data = self.format_data(X, y)
if len(data)==0 or data is None:
raise Exception("ERROR: could not format input correctly")
except:
raise Exception("ERROR: could not format input correctly")
#print(data)
logger.debug("Formated train data: %s", data)
with open(self.__train_file, 'w') as f:
f.write(data)
inputStreamFactory = self.__java_MarkableFileInputStreamFactory(self.__java_File(self.__java_String(self.__train_file)))
@@ -249,14 +249,14 @@ class opennlp_NER(Pipeline):
@overrides
# TODO
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
return
# EXTRA METHODS TO HELP WITH THE opennlp PIPELINE ##
@overrides
# models must be saved and loaded with extension ".bin"
def save_model(self, model_name):
def save_model(self, model_name: str):
if not model_name.endswith(".bin"):
logger.warning('WARNING: model_name must end with .bin, adding...')
model_name = model_name + ".bin"
@@ -266,7 +266,7 @@ class opennlp_NER(Pipeline):
@overrides
def load_model(self, model_name):
def load_model(self, model_name: str):
if not model_name.endswith(".bin"):
logger.warning('WARNING: model_name must end with .bin, adding...')
model_name = model_name + ".bin"
@@ -313,7 +313,7 @@ class opennlp_NER(Pipeline):
#Takes input data and formats it to be easier to use in the opennlp pipeline
#ASSUMES DATA FOLLOWS FORMAT X = [string], y = [[(start offset, stop offset, label), ()], ... []]
#Currently cannot assign more than one label to the same word
def format_data(self, X, y):
def format_data(self, X: typing.Iterable[str], y):
out = ''
try:
for doc, ann in zip(X, y):
@@ -373,13 +373,14 @@ class opennlp_NER(Pipeline):
labels_per_token.append(labels)
return labels_per_token
def evaluate(self, X, y, Xid):
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
predictions = self.predict(X)
stats = {'Totals': [0, 0, 0, 0]}
metrics = EvaluationMetrics()
for (doc_id, prediction) in zip(Xid, predictions):
for (index, prediction) in enumerate(predictions):
guesses: typing.List[NerPrediction] = prediction.ner
gold = y[Xid.index(doc_id)]
gold = y[index]
all_tokens = prediction.extra_data
@@ -414,47 +415,26 @@ class opennlp_NER(Pipeline):
else:
TN.append(label)
for label in all_known_labels:
if label not in stats:
stats[label] = [0,0,0,0]
if label not in metrics.labels:
metrics.labels[label] = StatMetrics()
for label in TP:
stats[label][0] += 1
stats['Totals'][0] += 1
metrics.labels[label].tp += 1
metrics.totals.tp += 1
for label in FP:
stats[label][1] += 1
stats['Totals'][1] += 1
metrics.labels[label].fp += 1
metrics.totals.fp += 1
for label in FN:
stats[label][2] += 1
stats['Totals'][2] += 1
metrics.labels[label].fn += 1
metrics.totals.fn += 1
for label in TN:
stats[label][3] += 1
stats['Totals'][3] += 1
metrics.labels[label].tn += 1
metrics.totals.tn += 1
for key in stats:
TP = stats[key][0]
FP = stats[key][1]
FN = stats[key][2]
TN = stats[key][3]
if (TP + FN) != 0:
recall = TP / (TP + FN)
else:
recall = 1.0
if (TP + FP) != 0:
precision = TP / (TP + FP)
else:
precision = 0.0
if (precision + recall) != 0:
f1 = 2 * (precision * recall) / (precision + recall)
else:
f1 = 0
if (TP + FN + FP + TN) != 0:
acc = (TP + TN) / (TP + FN + FP + TN)
else:
acc = 0
stats[key] = {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN, "TN" : TN, "acc": acc}
metrics.calc_precision_recall_f1_acc()
return stats
return metrics
def evaluate_orig(self, X, y, Xid):
def evaluate_orig(self, X: typing.Iterable[str], y, Xid):
try:
data = self.format_data(X, y)
if len(data) == 0 or data is None:

View File

@@ -1,8 +1,66 @@
# (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
import abc
import logging
import typing
logger = logging.getLogger(__name__)
class StatMetrics(object):
def __init__(self, precision: float = None, recall: float = None, f1: float = None,
tp: int = 0, fp: int = 0, fn: int = 0, tn: int = 0, acc: float = None):
self.precision = precision
self.recall = recall
self.f1 = f1
self.tp = tp
self.fp = fp
self.fn = fn
self.tn = tn
self.acc = acc
def calc_precision_recall_f1_acc(self):
if (self.tp + self.fn) != 0:
self.recall = self.tp / (self.tp + self.fn)
else:
self.recall = 1.0
if (self.tp + self.fp) != 0:
self.precision = self.tp / (self.tp + self.fp)
else:
self.precision = 0.0
if (self.precision + self.recall) != 0:
self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
else:
self.f1 = 0.0
if (self.tp + self.fn + self.fp + self.tn) != 0:
self.acc = (self.tp + self.tn) / (self.tp + self.fn + self.fp + self.tn)
else:
self.acc = 0.0
def serialize(self) -> dict:
return {"precision": self.precision, "recall": self.recall, "f1": self.f1, "TP": self.tp,
"FP": self.fp, "FN": self.fn, "TN": self.tn, "acc": self.acc}
class EvaluationMetrics(object):
def __init__(self):
self.labels: typing.Dict[str, StatMetrics] = {}
self.totals = StatMetrics()
def calc_precision_recall_f1_acc(self):
for label in self.labels:
self.labels[label].calc_precision_recall_f1_acc()
self.totals.calc_precision_recall_f1_acc()
def serialize(self) -> dict:
d = {}
for key in self.labels:
d[key] = self.labels[key].serialize()
if key == "Totals":
logging.warn("There was a label called 'Totals' that is going to be overridden.")
d["Totals"] = self.totals.serialize()
return d
class NerPrediction(object):
def __init__(self, offset_start: int, offset_end: int, label: str):
self.offset_start: int = offset_start
@@ -64,9 +122,15 @@ class Pipeline(object, metaclass=abc.ABCMeta):
# fit(X, y)
# internal state is changed
@abc.abstractmethod
def fit(self, X, y, **params) -> dict:
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
raise NotImplementedError('Must define fit to use Pipeline Base Class')
# evaluate(X, y, all_labels)
# returns stats
@abc.abstractmethod
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
raise NotImplementedError('Must define evaluate to use Pipeline Base Class')
# predict(X)
# returns [[[offset_start, offset_end, label], ..., ...]
@abc.abstractmethod
@@ -84,15 +148,15 @@ class Pipeline(object, metaclass=abc.ABCMeta):
# Given model's current state evaluate the input (id, String) pairs and return a rank ordering of lowest->highest scores for instances (will need to discuss specifics of ranking)
# Discussing rank is now a major project - see notes
@abc.abstractmethod
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
raise NotImplementedError('Must define next_example to use Pipeline Base Class')
# saves model so that it can be loaded again later
@abc.abstractmethod
def save_model(self, model_name):
def save_model(self, model_name: str):
raise NotImplementedError('Must define save_model to use Pipeline Base Class')
# loads a previously saved model
@abc.abstractmethod
def load_model(self, model_name):
def load_model(self, model_name: str):
raise NotImplementedError('Must define load_model to use Pipeline Base Class')
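
A brief sketch of the new StatMetrics / EvaluationMetrics classes in use (counts are made up; the import path is assumed from the package layout):

from pine.pipelines.pipeline import EvaluationMetrics, StatMetrics  # assumed import path

metrics = EvaluationMetrics()
metrics.labels["geo"] = StatMetrics(tp=3, fp=1, fn=2, tn=14)
metrics.totals = StatMetrics(tp=3, fp=1, fn=2, tn=14)
metrics.calc_precision_recall_f1_acc()
print(metrics.serialize())
# -> {"geo": {"precision": 0.75, "recall": 0.6, "f1": 0.666..., "TP": 3, "FP": 1, "FN": 2, "TN": 14, "acc": 0.85},
#     "Totals": {...}}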

View File

@@ -5,7 +5,7 @@ import logging
import os
import typing
from .pipeline import Pipeline, DocumentPredictions, DocumentPredictionProbabilities
from .pipeline import Pipeline, DocumentPredictions, DocumentPredictionProbabilities, EvaluationMetrics
from overrides import overrides
@@ -20,7 +20,7 @@ class NER(Pipeline):
__lib = ''
pipeline = -1
__SUPPORTED_PIPELINES = ['spacy', 'corenlp', 'opennlp']
__SUPPORTED_PIPELINES = ['spacy', 'corenlp', 'opennlp', 'simpletransformers']
#initializes proper nlp library pipeline based on user selection
#there are additional args to accomodate initializing different pipelines, check individual pipeline for specifics
@@ -56,8 +56,8 @@ class NER(Pipeline):
#internal state is changed
#kwargs varies between pipelines, see individual pipeline for extra arguments
@overrides
def fit(self, X, y, **kwargs) -> dict:
return self.pipeline.fit(X, y, **kwargs)
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
return self.pipeline.fit(X, y, all_labels, **params)
@overrides
def predict(self, X: typing.Iterable[str]) -> typing.List[DocumentPredictions]:
@@ -68,20 +68,19 @@ class NER(Pipeline):
def predict_proba(self, X: typing.Iterable[str], **kwargs) -> typing.List[DocumentPredictionProbabilities]:
return self.pipeline.predict_proba(X, **kwargs)
# evaluate(X, y, Xid)
# returns stats
def evaluate(self, X, y, Xid, **kwargs):
return self.pipeline.evaluate(X, y, Xid, **kwargs)
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
return self.pipeline.evaluate(X, y, all_labels, **kwargs)
#next_example(Xid)
#Given model's current state evaluate the input (id, String) pairs and return a rank ordering of lowest->highest scores for instances (will need to discuss specifics of ranking)
@overrides
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
#may want to program it here instead of one level down, as the ranking function might not change with the pipeline used
return self.pipeline.next_example(X, Xid)
@overrides
def save_model(self, model_name):
def save_model(self, model_name: str):
directory = os.path.dirname(model_name)
# if directories in path don't exist, create them
if not os.path.exists(directory):
@@ -90,5 +89,5 @@ class NER(Pipeline):
return self.pipeline.save_model(model_name)
@overrides
def load_model(self, model_name):
def load_model(self, model_name: str):
self.pipeline.load_model(model_name)

View File

@@ -95,6 +95,15 @@ class BaseConfig(object):
framework="spacy",
types=["fit", "predict", "status"]
)
),
dict(
name="simpletransformers",
version="1.0",
channel="service_simpletransformers",
service=dict(
framework="simpletransformers",
types=["fit", "predict", "status"]
)
)
]

View File

@@ -0,0 +1,305 @@
#!/usr/bin/env python3
# coding: utf8
# (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
import logging
import os
import os.path
from shutil import copyfile
import uuid
import typing
from overrides import overrides
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities, EvaluationMetrics, StatMetrics
from .shared.config import ConfigBuilder
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
import numpy as np
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
config = ConfigBuilder.get_config()
logger = logging.getLogger(__name__)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
# TODO: Change the collections.json file to default the collection for simple transformers with real classifiers, etc
class simpletransformers_NER(Pipeline):
def __init__(self, tmp_dir=None):
self.__id = uuid.uuid4()
if tmp_dir != None:
self.__temp_dir = tmp_dir
#can choose to dictate where the model will store files so that it doesn't overwrite any,
#otherwise it will write to a new directory within the resources folder
else:
self.__temp_dir = config.ROOT_DIR + '/tmp/simpletransformers-' + str(self.__id)
self.__model_dir = os.path.join(self.__temp_dir, "OUTPUT_MODEL/")
self.__default_model_args = {
# TODO: Some of these should be args passed in with defaults, probably epoch size, the dirs, any others Brant might say
# TODO: There is a runs/ directory that is created by default in the current directory where this is run (pipelines/),
# there might be an option to change that, or maybe just add to gitignore?
"output_dir": self.__model_dir,
"cache_dir": os.path.join(self.__temp_dir, "CACHE_DIR/"),
"tensorboard_dir": os.path.join(self.__temp_dir, "TENSORBOARD/"),
"max_seq_length": 128,
"train_batch_size": 16,
"gradient_accumulation_steps": 1,
"eval_batch_size": 8,
"num_train_epochs": 1,
"weight_decay": 0,
"learning_rate": 4e-5,
"adam_epsilon": 1e-8,
"warmup_ratio": 0.06,
"warmup_steps": 20,
"max_grad_norm": 1.0,
"logging_steps": 50,
"save_steps": 500,
"overwrite_output_dir": True,
"reprocess_input_data": False,
"evaluate_during_training": False,
}
# TODO: Switch back to Bio_ClinicalBERT, and also add this as a configurable option.
# All models we can use: https://huggingface.co/models
# self.__model_name = "emilyalsentzer/Bio_ClinicalBERT"
# This is currently being used because it is faster.
self.__model_type = "bert"
self.__model_name = "google/mobilebert-uncased"
self.__model_use_cuda = False
self.__model = None
self.__sentence_tokenizer = PunktSentenceTokenizer()
self.__word_tokenizer = WhitespaceTokenizer()
# status()
@overrides
def status(self) -> dict:
return {
"default_model_args": self.__default_model_args
}
# fit(X, y)
# internal state is changed
@overrides
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
# setting up params
model_args = self.__default_model_args.copy()
if params is not None:
for key in model_args.keys():
if key in params:
model_args[key] = params[key]
logger.info("Training with parameters: {}".format(model_args))
# First, need to set up the data into a pandas dataframe and format our labels
df = self._format_data(X, y)
labels = self._format_labels(all_labels)
# Create a new model, needs to be here for now since this is where we get labels
if not self.__model:
self.__model = NERModel(self.__model_type, self.__model_name, labels=labels,
use_cuda=self.__model_use_cuda, args=model_args)
# After this, the model should be trained, and output files created
self.__model.train_model(df, verbose=False, silent=True,
show_running_loss=False)
return {}
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
if not self.__model:
raise Exception("Can't evaluate until model has been trained or loaded")
# First, need to set up the data into a pandas dataframe and format our labels
df = self._format_data(X, y)
# No need to recreate model, as this is only run after fit().
# Evaluate.
result, model_outputs, preds_list = self.__model.eval_model(
df, verbose=False)
# acc=sklearn.metrics.accuracy_score
logger.info("Evaluated model, result={}".format(result))
metrics = EvaluationMetrics()
metrics.totals.precision = result["precision"]
metrics.totals.recall = result["recall"]
metrics.totals.f1 = result["f1_score"]
# TODO: need acc
# TODO: need metrics for each label
return metrics
# predict(X)
@overrides
def predict(self, X: typing.Iterable[str]) -> typing.List[DocumentPredictions]:
# First, make sure this model has been trained
if not self.__model:
return None
# Make predictions with the model
return_preds = []
for doc in X:
data = [s for s in self._sentencize(doc)]
predictions, _ = self.__model.predict([sentence for (_, _, sentence) in data])
return_preds.append(self._format_prediction(data, predictions))
return return_preds
# predict_proba(X)
# can also return scores for all labels if get_all is True
@overrides
def predict_proba(self, X: typing.Iterable[str], **kwargs) -> typing.List[DocumentPredictionProbabilities]:
# TODO: Need to implement this.
# The "raw_outputs" (second item in tuple returned from predict) is probably useful for this.
# Can turn predictions into probabilities for each label by running:
# Where the array passed in refers to each word (print raw_outputs in the expanded_ner.py file to see this)
# a = np.asarray([-0.2597193, 0.3929489, 0.42044127, 0.65579444, -0.075302914, 0.0072728638, 0.11236907, -0.035289638, -0.09346388, -0.25901815, -0.16599336, -0.06283752, -0.2664347])
# prob = softmax(a)
# prob is then equal to: array([0.0552652 , 0.10614558, 0.10910426, 0.13805568, 0.06645731,
# 0.07217802, 0.0801766 , 0.0691704 , 0.06526127, 0.05530396,
# 0.06069549, 0.06729091, 0.05489531]) which look like the probabilities of the labels (there are the same number of elements as labels)
# It probably refers to the order of the labels given in, so if the labels arg was ['B-geo', 'I-geo'...] then
# B-geo is probably 0.0552652 and I-geo is probably 0.10614558... etc
return []
# next_example(X, Xid)
# Given model's current state evaluate the input (id, String) pairs and return a rank ordering of lowest->highest scores for instances (will need to discuss specifics of ranking)
# Discussing rank is now a major project - see notes
@overrides
def next_example(self, X: typing.Iterable[str], Xid):
# Don't think we needed to do anything with this.
return None
# saves model so that it can be loaded again later
@overrides
def save_model(self, model_name: str):
# Save all files from the output dir to the desired spot in order to load
os.mkdir(model_name)
# Copy from the tmp directory - but not the checkpoints
for filename in os.listdir(self.__model_dir):
if "checkpoint" not in filename:
copyfile(os.path.join(self.__model_dir, filename), os.path.join(model_name, filename))
return model_name
# loads a previously saved model
@overrides
def load_model(self, model_name: str):
# Loading from model requires creating the model from the saved directory
# This "model_name" is just the path, it doesn't refer to the name like before
self.__model = NERModel(self.__model_type, model_name,
use_cuda=self.__model_use_cuda, args=self.__default_model_args)
###############################
# Helper Methods
###############################
def _get_word_label(self, start_index, end_index, label_list):
# Takes in the indices of a word and label list to return a related tag (if possible)
# This will account for the I-<label> or B-<label> that simpletransformers expects
for label_group in label_list:
# This works because the word either begins a multi-word label or the label only covers a single word
if label_group[0] == start_index:
return "B-" + label_group[2]
# This is at least the second word in a multi-word label
# <= because == works on the last word, > is for any word that appears BETWEEN the first and last words
elif label_group[0] < start_index and label_group[1] >= end_index:
return "I-" + label_group[2]
# Assuming y is always sorted, this ends the loop if there is no label at this index early to save time
elif end_index < label_group[0]:
break
# If it got here, the label was not found
return "O"
def _sentencize(self, text: str) -> typing.Generator[typing.Tuple[int, int, str], None, None]:
for (sentence_start, sentence_end) in self.__sentence_tokenizer.span_tokenize(text):
yield (sentence_start, sentence_end, text[sentence_start:sentence_end])
# Takes input data and formats it to be easier to use in the simpletransformers pipeline
# ASSUMES DATA FOLLOWS FORMAT X = [string], y = [[(start offset, stop offset, label), ()], ... []]
# Simpletransformers needs a pandas dataframe with columns: sentence_id, words, labels
def _format_data(self, X: typing.Iterable[str], y) -> pd.DataFrame:
# TODO: Need to check to make sure no sentence has over max_seq_length words
df = pd.DataFrame(columns=["sentence_id","words","labels"])
curr_sentence_id = 0
for (doc_txt, labels) in zip(X, y):
for (sentence_start, _, sentence) in self._sentencize(doc_txt):
for (sentence_word_start, sentence_word_end) in self.__word_tokenizer.span_tokenize(sentence):
word_start = sentence_start + sentence_word_start
word_end = sentence_start + sentence_word_end
word = doc_txt[word_start:word_end]
curr_label = self._get_word_label(word_start, word_end, labels)
df = df.append({
"sentence_id": curr_sentence_id,
"words": word,
"labels": curr_label
}, ignore_index=True)
curr_sentence_id += 1
return df
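# For illustration (made-up text): a document "John lives in Paris." annotated with
# (0, 4, "per") would produce rows roughly like:
#   sentence_id  words   labels
#   0            John    B-per
#   0            lives   O
#   0            in      O
#   0            Paris.  O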
# Takes the prediction output of simpletransformers ([[{'U.N.': 'B-per'}], [{'relief': 'I-gpe'}], ...])
# and turns it into the form PINE desires, [[[offset_start, offset_end, label], ..., ...]
def _format_prediction(self, data, predictions) -> DocumentPredictions:
ner: typing.List[NerPrediction] = []
for (index, sentence_predictions) in enumerate(predictions):
sentence_start, _, sentence = data[index]
current_label = None
current_label_start = None
current_label_end = None
word_index = 0
sentence_ner: typing.List[NerPrediction] = []
for pred_dict in sentence_predictions:
for (word, label) in pred_dict.items():
word_index = sentence.find(word, word_index)
if label == "O":
if current_label != None:
sentence_ner.append(NerPrediction(current_label_start, current_label_end, current_label))
current_label = current_label_start = current_label_end = None
continue
is_b = label.startswith("B-")
is_i = label.startswith("I-")
if is_b or is_i:
label = label[2:]
# if we're at the beginning, we always add the old tag
# if we're at an inner and it's different from the current label, add the old tag
if current_label != None and (is_b or (is_i and label != current_label)):
sentence_ner.append(NerPrediction(current_label_start, current_label_end, current_label))
current_label = current_label_start = current_label_end = None
if current_label != None: # continuing the label
current_label_end = sentence_start + word_index + len(word)
else: # new label
current_label = label
current_label_start = sentence_start + word_index
current_label_end = sentence_start + word_index + len(word)
# the last label
if current_label != None:
sentence_ner.append(NerPrediction(current_label_start, current_label_end, current_label))
ner += sentence_ner
return DocumentPredictions(ner, [])
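# For example (illustrative): predictions [[{'U.N.': 'B-org'}, {'official': 'O'}]] over the
# single sentence "U.N. official Ekeus heads for Baghdad." would yield roughly
# [NerPrediction(0, 4, 'org')]; consecutive B-/I- tags with the same label merge into one span.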
# Get a list of all labels in a set of data
def _format_labels(self, all_labels: typing.List[str]):
# Have to add a B-<label> and I-<label> for each label.
ret_labels = []
for label in all_labels:
ret_labels.append("B-" + str(label))
ret_labels.append("I-" + str(label))
# Add the other tag
ret_labels.append("O")
return ret_labels
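
As a companion to the predict_proba TODO above, a minimal sketch (assuming numpy and scipy are available) of turning one word's raw model outputs into per-label probabilities with a softmax; the values and label ordering are illustrative:

import numpy as np
from scipy.special import softmax

# One word's row of raw_outputs from NERModel.predict (values are made up).
raw = np.asarray([-0.26, 0.39, 0.42, 0.66, -0.08, 0.01, 0.11, -0.04, -0.09, -0.26, -0.17, -0.06, -0.27])
probs = softmax(raw)  # sums to 1.0; one entry per label
# probs[i] would line up with the i-th entry of the labels list passed to NERModel,
# e.g. the ["B-geo", "I-geo", ..., "O"] list produced by _format_labels().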

View File

@@ -19,7 +19,7 @@ from spacy.scorer import Scorer
from spacy.gold import GoldParse
from overrides import overrides
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities, EvaluationMetrics, StatMetrics
logger = logging.getLogger(__name__)
@@ -74,7 +74,7 @@ class spacy_NER(Pipeline):
}
@overrides
def fit(self, X, y, **params) -> dict:
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
#setting up params
default_params = self.__default_fit_params.copy()
if params is not None:
@@ -118,25 +118,24 @@ class spacy_NER(Pipeline):
"losses": all_losses
}
def evaluate(self, X, y, Xid):
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
train_data = self.format_data(X, y)
all_labels = set()
metrics = dict()
metrics = EvaluationMetrics()
# get all labels
for text, annot in train_data:
for ent in annot['entities']:
all_labels.add(ent[2])
all_labels = list(all_labels)
stats = {}
for text, annots in train_data:
pred_doc = self.__nlp(text)
gold_doc = self.__nlp.make_doc(text)
gold_labels = []
stats['Totals'] = [0,0,0,0]
for label in all_labels:
stats[label] = [0,0,0,0]
metrics.labels[label] = StatMetrics()
for token in pred_doc:
gold_labels.append(set())
@@ -149,11 +148,9 @@ class spacy_NER(Pipeline):
goldParse = GoldParse(gold_doc, entities=annotations_for_label)
for index, annotation in enumerate(goldParse.ner):
if annotation != 'O':
gold_labels[index].add(annotation[2:])
for index, pred_token in enumerate(pred_doc):
pred_label = pred_token.ent_type_
if pred_label != '':
@@ -161,56 +158,33 @@ class spacy_NER(Pipeline):
if label == pred_label:
if label in gold_labels[index]:
#TP
stats[label][0] += 1
stats['Totals'][0] += 1
metrics.labels[label].tp += 1
metrics.totals.tp += 1
else:
#FP
stats[label][1] += 1
stats['Totals'][1] += 1
metrics.labels[label].fp += 1
metrics.totals.fp += 1
else:
#All other labels are true negative because the model can only predict one label per token
#TN
stats[label][3] += 1
stats['Totals'][3] += 1
metrics.labels[label].tn += 1
metrics.totals.tn += 1
else:
for label in all_labels:
if label in gold_labels[index]:
#FN
stats[label][2] += 1
stats['Totals'][2] += 1
metrics.labels[label].fn += 1
metrics.totals.fn += 1
else:
#TN
stats[label][3] += 1
stats['Totals'][3] += 1
metrics.labels[label].tn += 1
metrics.totals.tn += 1
for key in stats:
TP = stats[key][0]
FP = stats[key][1]
FN = stats[key][2]
TN = stats[key][3]
if (TP + FN) != 0:
recall = TP / (TP + FN)
else:
recall = 1.0
if (TP + FP) != 0:
precision = TP / (TP + FP)
else:
precision = 0.0
if (precision + recall) != 0:
f1 = 2 * (precision * recall) / (precision + recall)
else:
f1 = 0
if (TP + FN + FP + TN) != 0:
acc = (TP + TN) / (TP + FN + FP + TN)
else:
acc = 0
metrics[key] = {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN, "TN": TN,
"acc": acc}
metrics.calc_precision_recall_f1_acc()
return metrics
@@ -356,14 +330,14 @@ class spacy_NER(Pipeline):
@overrides
# TODO
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
return
## EXTRA METHODS TO HELP WITH THE SPACY PIPELINE ##
# Takes input data and formats it to be easier to use in the spacy pipeline
# ASSUMES DATA FOLLOWS FORMAT X = [string], y = [[(start offset, stop offset, label), ()], ... []]
def format_data(self, X, y):
def format_data(self, X: typing.Iterable[str], y):
out = []
for i, text in enumerate(X):
out.append((text, {'entities': [(labels) for labels in y[i]]}))
@@ -374,11 +348,11 @@ class spacy_NER(Pipeline):
self.__ner.add_label(entity)
@overrides
def save_model(self, model_name):
def save_model(self, model_name: str):
self.__nlp.to_disk(model_name)
logger.info('Saved model to ' + model_name)
return model_name
@overrides
def load_model(self, model_name):
def load_model(self, model_name: str):
self._load_model(model_path=model_name)

View File

@@ -161,6 +161,9 @@ def main():
if not os.path.isfile(redis_start):
lock_print("Couldn't find redis start script: {}.".format(redis_start))
return 1
pipeline_dir = os.path.join(DIR, "pipelines")
pipeline_start = os.path.join(pipeline_dir, "dev_run.sh")
backend_dir = os.path.join(DIR, "backend")
backend_start = os.path.join(backend_dir, "dev_run.sh")
@@ -179,24 +182,21 @@ def main():
if docker:
frontend_annotation_start = [frontend_annotation_start, "--", "--host", "0.0.0.0"]
pipeline_dir = os.path.join(DIR, "pipelines")
pipeline_start = os.path.join(pipeline_dir, "dev_run.sh")
eve_process = start_eve_process(eve_dir, eve_start)
if not eve_only:
redis_process = start_redis_process(redis_dir, redis_start)
pipeline_process = start_pipeline(pipeline_dir, pipeline_start)
backend_process = start_backend_process(backend_dir, backend_start)
if not eve_only and not backend_only:
frontend_annotation_process = start_frontend_annotation_process(frontend_annotation_dir, frontend_annotation_start)
pipeline_process = start_pipeline(pipeline_dir, pipeline_start)
def signal_handler(sig, frame):
lock_print("")
if not eve_only and not backend_only:
stop_pipeline(pipeline_process)
stop_frontend_annotation_process(frontend_annotation_process)
if not eve_only:
stop_backend_process(backend_process)
stop_pipeline(pipeline_process)
stop_redis_process(redis_process)
stop_eve_process(eve_process)
lock_print("")

View File

@@ -39,7 +39,7 @@
"labels": ["geo", "gpe", "per", "org", "tim", "art"],
"metadata": {
"title": "Small Collection",
"description": "This is a small collection"
"description": "This is a small collection using spaCy pipeline"
},
"archived": false,
"configuration": {
@@ -156,5 +156,35 @@
"text_column": 0
}
}
},
{
"collection": {
"creator_id": "ada",
"annotators": ["ada"],
"viewers": ["ada"],
"labels": ["geo", "gpe", "per", "org", "tim", "art"],
"metadata": {
"title": "Small Collection Simpletransformers",
"description": "This is a small collection using Simpletransformers pipeline"
},
"archived": false,
"configuration": {
"allow_overlapping_ner_annotations": false
}
}, "classifier": {
"pipelineId": "5babb6ee4eb7dd2c39b96720",
"overlap": 0,
"train_every": 5,
"classifierParameters": {
"cutoff": 1,
"iterations": 5
}
}, "documents": {
"ner_annotations": {
"csv_file": "./ner_dataset.csv",
"sentences_per_doc": 5
},
"num_docs": 5
}
}
]

View File

@@ -40,5 +40,15 @@
"use_type_seqs2": [true, false],
"use_type_y_sequences": [true, false]
}
},
{
"_id": "5babb6ee4eb7dd2c39b96720",
"title": "SimpleTransformers - Bio-ClinicalBERT",
"description": "SimpleTransformers models.",
"name": "simpletransformers",
"parameters": {
"training_batch_size": "integer",
"num_train_epochs": "integer"
}
}
]

View File

@@ -239,7 +239,18 @@ def test_train_and_predict_opennlp():
[972, 976, 'gpe'], [1025, 1029, 'gpe'], [1089, 1096, 'geo'], [1113, 1120, 'gpe'],
[1200, 1209, 'tim'], [1221, 1225, 'org']]
def test_sync_train():
def test_train_and_predict_simpletransformers():
prediction = _test_train_and_predict("Small Collection Simpletransformers")
assert len(prediction["doc"]) == 0
preds = prediction["ner"]
# unfortunately the simpletransformers predictions are not the same across runs
# and there don't seem to be guaranteed common tokens
# so just make sure any predictions have proper labels...
common_labels = {'gpe', 'org', 'geo', 'tim', 'per'}
for pred in preds:
assert pred[2] in common_labels
def test_sync_train():
client = common.login_with_test_user(common.client())
collection = common.get_collection(client, "Small Collection OpenNLP")