Merge pull request #61 from jhuapl-lglenden/develop-updates

Updates from latest development branch.
Authored by Laura Glendenning on 2022-02-04 18:07:08 -05:00; committed via GitHub.
35 changed files with 4033 additions and 1493 deletions

.env

@@ -10,6 +10,7 @@ EVE_DB_VOLUME=eve_db
OPENNLP_ID=5babb6ee4eb7dd2c39b9671c
CORENLP_ID=5babb6ee4eb7dd2c39b9671d
DOCUMENT_CLASSIFIER_ID=5babb6ee4eb7dd2c39b9671b
SIMPLETRANSFORMERS_ID=5babb6ee4eb7dd2c39b96720
EXPOSED_SERVER_TYPE=https
EXPOSED_SERVER_NAME=localhost


@@ -25,6 +25,7 @@ scipy = "~=1.7.1"
tabulate = "~=0.8.9"
multiprocessing-logging = "~=0.3.1"
flask-httpauth = "~=4.4.0"
lxml = "~=4.6.3"
[dev-packages]

backend/Pipfile.lock (generated)

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "3a30a022ccd4fbe028c8cf2c8f741b9c7f7fa72e039dba391da62a20e58c5273"
"sha256": "382dbc37f5349e1a9d22b266891cad743be81ff76fe395c112c157b6a110ed62"
},
"pipfile-spec": 6,
"requires": {
@@ -55,10 +55,10 @@
},
"certifi": {
"hashes": [
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
],
"version": "==2021.5.30"
"version": "==2021.10.8"
},
"cffi": {
"hashes": [
@@ -112,39 +112,45 @@
},
"charset-normalizer": {
"hashes": [
"sha256:0c8911edd15d19223366a194a513099a302055a962bca2cec0f54b8b63175d8b",
"sha256:f23667ebe1084be45f6ae0538e4a5a865206544097e4e8bbcacf42cd02a348f3"
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
],
"markers": "python_version >= '3'",
"version": "==2.0.4"
"version": "==2.0.7"
},
"click": {
"hashes": [
"sha256:8c04c11192119b1ef78ea049e0a6f0463e4c48ef00a30160c704337586f3ad7a",
"sha256:fba402a4a47334742d782209a7c79bc448911afe1149d07bdabdf480b3e2f4b6"
"sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3",
"sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"
],
"markers": "python_version >= '3.6'",
"version": "==8.0.1"
"version": "==8.0.3"
},
"cryptography": {
"hashes": [
"sha256:0f1212a66329c80d68aeeb39b8a16d54ef57071bf22ff4e521657b27372e327d",
"sha256:1e056c28420c072c5e3cb36e2b23ee55e260cb04eee08f702e0edfec3fb51959",
"sha256:240f5c21aef0b73f40bb9f78d2caff73186700bf1bc6b94285699aff98cc16c6",
"sha256:26965837447f9c82f1855e0bc8bc4fb910240b6e0d16a664bb722df3b5b06873",
"sha256:37340614f8a5d2fb9aeea67fd159bfe4f5f4ed535b1090ce8ec428b2f15a11f2",
"sha256:3d10de8116d25649631977cb37da6cbdd2d6fa0e0281d014a5b7d337255ca713",
"sha256:3d8427734c781ea5f1b41d6589c293089704d4759e34597dce91014ac125aad1",
"sha256:7ec5d3b029f5fa2b179325908b9cd93db28ab7b85bb6c1db56b10e0b54235177",
"sha256:8e56e16617872b0957d1c9742a3f94b43533447fd78321514abbe7db216aa250",
"sha256:b01fd6f2737816cb1e08ed4807ae194404790eac7ad030b34f2ce72b332f5586",
"sha256:bf40af59ca2465b24e54f671b2de2c59257ddc4f7e5706dbd6930e26823668d3",
"sha256:de4e5f7f68220d92b7637fc99847475b59154b7a1b3868fb7385337af54ac9ca",
"sha256:eb8cc2afe8b05acbd84a43905832ec78e7b3873fb124ca190f574dca7389a87d",
"sha256:ee77aa129f481be46f8d92a1a7db57269a2f23052d5f2433b4621bb457081cc9"
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
],
"markers": "python_version >= '3.6'",
"version": "==3.4.7"
"version": "==35.0.0"
},
"cycler": {
"hashes": [
@@ -155,11 +161,11 @@
},
"flask": {
"hashes": [
"sha256:1c4c257b1892aec1398784c63791cbaa43062f1f7aeb555c4da961b20ee68f55",
"sha256:a6209ca15eb63fc9385f38e452704113d679511d9574d09b2cf9183ae7d20dc9"
"sha256:7b2fb8e934ddd50731893bdcdb00fc8c0315916f9fcd50d22c7cc1a95ab634e2",
"sha256:cb90f62f1d8e4dc4621f52106613488b5ba826b2e1e10a33eac92f723093ab6a"
],
"index": "pypi",
"version": "==2.0.1"
"version": "==2.0.2"
},
"flask-cors": {
"hashes": [
@@ -195,49 +201,115 @@
},
"jinja2": {
"hashes": [
"sha256:1f06f2da51e7b56b8f238affdd6b4e2c61e39598a378cc49345bc1bd42a978a4",
"sha256:703f484b47a6af502e743c9122595cc812b0271f661722403114f71a79d0f5a4"
"sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
"sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
],
"markers": "python_version >= '3.6'",
"version": "==3.0.1"
"version": "==3.0.2"
},
"kiwisolver": {
"hashes": [
"sha256:0cd53f403202159b44528498de18f9285b04482bab2a6fc3f5dd8dbb9352e30d",
"sha256:1e1bc12fb773a7b2ffdeb8380609f4f8064777877b2225dec3da711b421fda31",
"sha256:225e2e18f271e0ed8157d7f4518ffbf99b9450fca398d561eb5c4a87d0986dd9",
"sha256:232c9e11fd7ac3a470d65cd67e4359eee155ec57e822e5220322d7b2ac84fbf0",
"sha256:31dfd2ac56edc0ff9ac295193eeaea1c0c923c0355bf948fbd99ed6018010b72",
"sha256:33449715e0101e4d34f64990352bce4095c8bf13bed1b390773fc0a7295967b3",
"sha256:401a2e9afa8588589775fe34fc22d918ae839aaaf0c0e96441c0fdbce6d8ebe6",
"sha256:44a62e24d9b01ba94ae7a4a6c3fb215dc4af1dde817e7498d901e229aaf50e4e",
"sha256:50af681a36b2a1dee1d3c169ade9fdc59207d3c31e522519181e12f1b3ba7000",
"sha256:563c649cfdef27d081c84e72a03b48ea9408c16657500c312575ae9d9f7bc1c3",
"sha256:5989db3b3b34b76c09253deeaf7fbc2707616f130e166996606c284395da3f18",
"sha256:5a7a7dbff17e66fac9142ae2ecafb719393aaee6a3768c9de2fd425c63b53e21",
"sha256:5c3e6455341008a054cccee8c5d24481bcfe1acdbc9add30aa95798e95c65621",
"sha256:5f6ccd3dd0b9739edcf407514016108e2280769c73a85b9e59aa390046dbf08b",
"sha256:72c99e39d005b793fb7d3d4e660aed6b6281b502e8c1eaf8ee8346023c8e03bc",
"sha256:78751b33595f7f9511952e7e60ce858c6d64db2e062afb325985ddbd34b5c131",
"sha256:834ee27348c4aefc20b479335fd422a2c69db55f7d9ab61721ac8cd83eb78882",
"sha256:8be8d84b7d4f2ba4ffff3665bcd0211318aa632395a1a41553250484a871d454",
"sha256:950a199911a8d94683a6b10321f9345d5a3a8433ec58b217ace979e18f16e248",
"sha256:a357fd4f15ee49b4a98b44ec23a34a95f1e00292a139d6015c11f55774ef10de",
"sha256:a53d27d0c2a0ebd07e395e56a1fbdf75ffedc4a05943daf472af163413ce9598",
"sha256:acef3d59d47dd85ecf909c359d0fd2c81ed33bdff70216d3956b463e12c38a54",
"sha256:b38694dcdac990a743aa654037ff1188c7a9801ac3ccc548d3341014bc5ca278",
"sha256:b9edd0110a77fc321ab090aaa1cfcaba1d8499850a12848b81be2222eab648f6",
"sha256:c08e95114951dc2090c4a630c2385bef681cacf12636fb0241accdc6b303fd81",
"sha256:c5518d51a0735b1e6cee1fdce66359f8d2b59c3ca85dc2b0813a8aa86818a030",
"sha256:c8fd0f1ae9d92b42854b2979024d7597685ce4ada367172ed7c09edf2cef9cb8",
"sha256:ca3820eb7f7faf7f0aa88de0e54681bddcb46e485beb844fcecbcd1c8bd01689",
"sha256:cf8b574c7b9aa060c62116d4181f3a1a4e821b2ec5cbfe3775809474113748d4",
"sha256:d3155d828dec1d43283bd24d3d3e0d9c7c350cdfcc0bd06c0ad1209c1bbc36d0",
"sha256:f8d6f8db88049a699817fd9178782867bf22283e3813064302ac59f61d95be05",
"sha256:fd34fbbfbc40628200730bc1febe30631347103fc8d3d4fa012c21ab9c11eca9"
"sha256:0007840186bacfaa0aba4466d5890334ea5938e0bb7e28078a0eb0e63b5b59d5",
"sha256:19554bd8d54cf41139f376753af1a644b63c9ca93f8f72009d50a2080f870f77",
"sha256:1d45d1c74f88b9f41062716c727f78f2a59a5476ecbe74956fafb423c5c87a76",
"sha256:1d819553730d3c2724582124aee8a03c846ec4362ded1034c16fb3ef309264e6",
"sha256:2210f28778c7d2ee13f3c2a20a3a22db889e75f4ec13a21072eabb5693801e84",
"sha256:22521219ca739654a296eea6d4367703558fba16f98688bd8ce65abff36eaa84",
"sha256:25405f88a37c5f5bcba01c6e350086d65e7465fd1caaf986333d2a045045a223",
"sha256:2b65bd35f3e06a47b5c30ea99e0c2b88f72c6476eedaf8cfbc8e66adb5479dcf",
"sha256:2ddb500a2808c100e72c075cbb00bf32e62763c82b6a882d403f01a119e3f402",
"sha256:2f8f6c8f4f1cff93ca5058d6ec5f0efda922ecb3f4c5fb76181f327decff98b8",
"sha256:30fa008c172355c7768159983a7270cb23838c4d7db73d6c0f6b60dde0d432c6",
"sha256:3dbb3cea20b4af4f49f84cffaf45dd5f88e8594d18568e0225e6ad9dec0e7967",
"sha256:4116ba9a58109ed5e4cb315bdcbff9838f3159d099ba5259c7c7fb77f8537492",
"sha256:44e6adf67577dbdfa2d9f06db9fbc5639afefdb5bf2b4dfec25c3a7fbc619536",
"sha256:5326ddfacbe51abf9469fe668944bc2e399181a2158cb5d45e1d40856b2a0589",
"sha256:70adc3658138bc77a36ce769f5f183169bc0a2906a4f61f09673f7181255ac9b",
"sha256:72be6ebb4e92520b9726d7146bc9c9b277513a57a38efcf66db0620aec0097e0",
"sha256:7843b1624d6ccca403a610d1277f7c28ad184c5aa88a1750c1a999754e65b439",
"sha256:7ba5a1041480c6e0a8b11a9544d53562abc2d19220bfa14133e0cdd9967e97af",
"sha256:80efd202108c3a4150e042b269f7c78643420cc232a0a771743bb96b742f838f",
"sha256:82f49c5a79d3839bc8f38cb5f4bfc87e15f04cbafa5fbd12fb32c941cb529cfb",
"sha256:83d2c9db5dfc537d0171e32de160461230eb14663299b7e6d18ca6dca21e4977",
"sha256:8d93a1095f83e908fc253f2fb569c2711414c0bfd451cab580466465b235b470",
"sha256:8dc3d842fa41a33fe83d9f5c66c0cc1f28756530cd89944b63b072281e852031",
"sha256:9661a04ca3c950a8ac8c47f53cbc0b530bce1b52f516a1e87b7736fec24bfff0",
"sha256:a498bcd005e8a3fedd0022bb30ee0ad92728154a8798b703f394484452550507",
"sha256:a7a4cf5bbdc861987a7745aed7a536c6405256853c94abc9f3287c3fa401b174",
"sha256:b5074fb09429f2b7bc82b6fb4be8645dcbac14e592128beeff5461dcde0af09f",
"sha256:b6a5431940f28b6de123de42f0eb47b84a073ee3c3345dc109ad550a3307dd28",
"sha256:ba677bcaff9429fd1bf01648ad0901cea56c0d068df383d5f5856d88221fe75b",
"sha256:bcadb05c3d4794eb9eee1dddf1c24215c92fb7b55a80beae7a60530a91060560",
"sha256:bf7eb45d14fc036514c09554bf983f2a72323254912ed0c3c8e697b62c4c158f",
"sha256:c358721aebd40c243894298f685a19eb0491a5c3e0b923b9f887ef1193ddf829",
"sha256:c4550a359c5157aaf8507e6820d98682872b9100ce7607f8aa070b4b8af6c298",
"sha256:c6572c2dab23c86a14e82c245473d45b4c515314f1f859e92608dcafbd2f19b8",
"sha256:cba430db673c29376135e695c6e2501c44c256a81495da849e85d1793ee975ad",
"sha256:dedc71c8eb9c5096037766390172c34fb86ef048b8e8958b4e484b9e505d66bc",
"sha256:e6f5eb2f53fac7d408a45fbcdeda7224b1cfff64919d0f95473420a931347ae9",
"sha256:ec2eba188c1906b05b9b49ae55aae4efd8150c61ba450e6721f64620c50b59eb",
"sha256:ee040a7de8d295dbd261ef2d6d3192f13e2b08ec4a954de34a6fb8ff6422e24c",
"sha256:eedd3b59190885d1ebdf6c5e0ca56828beb1949b4dfe6e5d0256a461429ac386",
"sha256:f441422bb313ab25de7b3dbfd388e790eceb76ce01a18199ec4944b369017009",
"sha256:f8eb7b6716f5b50e9c06207a14172cf2de201e41912ebe732846c02c830455b9",
"sha256:fc4453705b81d03568d5b808ad8f09c77c47534f6ac2e72e733f9ca4714aa75c"
],
"markers": "python_version >= '3.6'",
"version": "==1.3.1"
"markers": "python_version >= '3.7'",
"version": "==1.3.2"
},
"lxml": {
"hashes": [
"sha256:079f3ae844f38982d156efce585bc540c16a926d4436712cf4baee0cce487a3d",
"sha256:0fbcf5565ac01dff87cbfc0ff323515c823081c5777a9fc7703ff58388c258c3",
"sha256:122fba10466c7bd4178b07dba427aa516286b846b2cbd6f6169141917283aae2",
"sha256:1b38116b6e628118dea5b2186ee6820ab138dbb1e24a13e478490c7db2f326ae",
"sha256:1b7584d421d254ab86d4f0b13ec662a9014397678a7c4265a02a6d7c2b18a75f",
"sha256:26e761ab5b07adf5f555ee82fb4bfc35bf93750499c6c7614bd64d12aaa67927",
"sha256:289e9ca1a9287f08daaf796d96e06cb2bc2958891d7911ac7cae1c5f9e1e0ee3",
"sha256:2a9d50e69aac3ebee695424f7dbd7b8c6d6eb7de2a2eb6b0f6c7db6aa41e02b7",
"sha256:3082c518be8e97324390614dacd041bb1358c882d77108ca1957ba47738d9d59",
"sha256:33bb934a044cf32157c12bfcfbb6649807da20aa92c062ef51903415c704704f",
"sha256:3439c71103ef0e904ea0a1901611863e51f50b5cd5e8654a151740fde5e1cade",
"sha256:36108c73739985979bf302006527cf8a20515ce444ba916281d1c43938b8bb96",
"sha256:39b78571b3b30645ac77b95f7c69d1bffc4cf8c3b157c435a34da72e78c82468",
"sha256:4289728b5e2000a4ad4ab8da6e1db2e093c63c08bdc0414799ee776a3f78da4b",
"sha256:4bff24dfeea62f2e56f5bab929b4428ae6caba2d1eea0c2d6eb618e30a71e6d4",
"sha256:4c61b3a0db43a1607d6264166b230438f85bfed02e8cff20c22e564d0faff354",
"sha256:542d454665a3e277f76954418124d67516c5f88e51a900365ed54a9806122b83",
"sha256:5a0a14e264069c03e46f926be0d8919f4105c1623d620e7ec0e612a2e9bf1c04",
"sha256:5c8c163396cc0df3fd151b927e74f6e4acd67160d6c33304e805b84293351d16",
"sha256:64812391546a18896adaa86c77c59a4998f33c24788cadc35789e55b727a37f4",
"sha256:66e575c62792c3f9ca47cb8b6fab9e35bab91360c783d1606f758761810c9791",
"sha256:6f12e1427285008fd32a6025e38e977d44d6382cf28e7201ed10d6c1698d2a9a",
"sha256:74f7d8d439b18fa4c385f3f5dfd11144bb87c1da034a466c5b5577d23a1d9b51",
"sha256:7610b8c31688f0b1be0ef882889817939490a36d0ee880ea562a4e1399c447a1",
"sha256:76fa7b1362d19f8fbd3e75fe2fb7c79359b0af8747e6f7141c338f0bee2f871a",
"sha256:7728e05c35412ba36d3e9795ae8995e3c86958179c9770e65558ec3fdfd3724f",
"sha256:8157dadbb09a34a6bd95a50690595e1fa0af1a99445e2744110e3dca7831c4ee",
"sha256:820628b7b3135403540202e60551e741f9b6d3304371712521be939470b454ec",
"sha256:884ab9b29feaca361f7f88d811b1eea9bfca36cf3da27768d28ad45c3ee6f969",
"sha256:89b8b22a5ff72d89d48d0e62abb14340d9e99fd637d046c27b8b257a01ffbe28",
"sha256:92e821e43ad382332eade6812e298dc9701c75fe289f2a2d39c7960b43d1e92a",
"sha256:b007cbb845b28db4fb8b6a5cdcbf65bacb16a8bd328b53cbc0698688a68e1caa",
"sha256:bc4313cbeb0e7a416a488d72f9680fffffc645f8a838bd2193809881c67dd106",
"sha256:bccbfc27563652de7dc9bdc595cb25e90b59c5f8e23e806ed0fd623755b6565d",
"sha256:c1a40c06fd5ba37ad39caa0b3144eb3772e813b5fb5b084198a985431c2f1e8d",
"sha256:c47ff7e0a36d4efac9fd692cfa33fbd0636674c102e9e8d9b26e1b93a94e7617",
"sha256:c4f05c5a7c49d2fb70223d0d5bcfbe474cf928310ac9fa6a7c6dddc831d0b1d4",
"sha256:cdaf11d2bd275bf391b5308f86731e5194a21af45fbaaaf1d9e8147b9160ea92",
"sha256:ce256aaa50f6cc9a649c51be3cd4ff142d67295bfc4f490c9134d0f9f6d58ef0",
"sha256:d2e35d7bf1c1ac8c538f88d26b396e73dd81440d59c1ef8522e1ea77b345ede4",
"sha256:d916d31fd85b2f78c76400d625076d9124de3e4bda8b016d25a050cc7d603f24",
"sha256:df7c53783a46febb0e70f6b05df2ba104610f2fb0d27023409734a3ecbb78fb2",
"sha256:e1cbd3f19a61e27e011e02f9600837b921ac661f0c40560eefb366e4e4fb275e",
"sha256:efac139c3f0bf4f0939f9375af4b02c5ad83a622de52d6dfa8e438e8e01d0eb0",
"sha256:efd7a09678fd8b53117f6bae4fa3825e0a22b03ef0a932e070c0bdbb3a35e654",
"sha256:f2380a6376dfa090227b663f9678150ef27543483055cc327555fb592c5967e2",
"sha256:f8380c03e45cf09f8557bdaa41e1fa7c81f3ae22828e1db470ab2a6c96d8bc23",
"sha256:f90ba11136bfdd25cae3951af8da2e95121c9b9b93727b1b896e3fa105b2f586"
],
"index": "pypi",
"version": "==4.6.3"
},
"markupsafe": {
"hashes": [
@@ -387,56 +459,70 @@
},
"pebble": {
"hashes": [
"sha256:556de0f4c65f943b73ba85ab4621f18000864d42a9d562c470ce7bf396d96424",
"sha256:b0abdc8830c21307038d63454584f71c2943e542e4e9d4c86d67aebc06c3519b"
"sha256:46e02767b239a29b8150466514fabb5c6632bea8c9b7456dfdb715f4636fc8a3",
"sha256:694e1105db888f3576b8f00662f90b057cf3780e6f8b7f57955a568008d0f497"
],
"index": "pypi",
"version": "==4.6.1"
"version": "==4.6.3"
},
"pillow": {
"hashes": [
"sha256:0b2efa07f69dc395d95bb9ef3299f4ca29bcb2157dc615bae0b42c3c20668ffc",
"sha256:114f816e4f73f9ec06997b2fde81a92cbf0777c9e8f462005550eed6bae57e63",
"sha256:147bd9e71fb9dcf08357b4d530b5167941e222a6fd21f869c7911bac40b9994d",
"sha256:15a2808e269a1cf2131930183dcc0419bc77bb73eb54285dde2706ac9939fa8e",
"sha256:196560dba4da7a72c5e7085fccc5938ab4075fd37fe8b5468869724109812edd",
"sha256:1c03e24be975e2afe70dfc5da6f187eea0b49a68bb2b69db0f30a61b7031cee4",
"sha256:1fd5066cd343b5db88c048d971994e56b296868766e461b82fa4e22498f34d77",
"sha256:29c9569049d04aaacd690573a0398dbd8e0bf0255684fee512b413c2142ab723",
"sha256:2b6dfa068a8b6137da34a4936f5a816aba0ecc967af2feeb32c4393ddd671cba",
"sha256:2cac53839bfc5cece8fdbe7f084d5e3ee61e1303cccc86511d351adcb9e2c792",
"sha256:2ee77c14a0299d0541d26f3d8500bb57e081233e3fa915fa35abd02c51fa7fae",
"sha256:37730f6e68bdc6a3f02d2079c34c532330d206429f3cee651aab6b66839a9f0e",
"sha256:3f08bd8d785204149b5b33e3b5f0ebbfe2190ea58d1a051c578e29e39bfd2367",
"sha256:479ab11cbd69612acefa8286481f65c5dece2002ffaa4f9db62682379ca3bb77",
"sha256:4bc3c7ef940eeb200ca65bd83005eb3aae8083d47e8fcbf5f0943baa50726856",
"sha256:660a87085925c61a0dcc80efb967512ac34dbb256ff7dd2b9b4ee8dbdab58cf4",
"sha256:67b3666b544b953a2777cb3f5a922e991be73ab32635666ee72e05876b8a92de",
"sha256:70af7d222df0ff81a2da601fab42decb009dc721545ed78549cb96e3a1c5f0c8",
"sha256:75e09042a3b39e0ea61ce37e941221313d51a9c26b8e54e12b3ececccb71718a",
"sha256:8960a8a9f4598974e4c2aeb1bff9bdd5db03ee65fd1fce8adf3223721aa2a636",
"sha256:9364c81b252d8348e9cc0cb63e856b8f7c1b340caba6ee7a7a65c968312f7dab",
"sha256:969cc558cca859cadf24f890fc009e1bce7d7d0386ba7c0478641a60199adf79",
"sha256:9a211b663cf2314edbdb4cf897beeb5c9ee3810d1d53f0e423f06d6ebbf9cd5d",
"sha256:a17ca41f45cf78c2216ebfab03add7cc350c305c38ff34ef4eef66b7d76c5229",
"sha256:a2f381932dca2cf775811a008aa3027671ace723b7a38838045b1aee8669fdcf",
"sha256:a4eef1ff2d62676deabf076f963eda4da34b51bc0517c70239fafed1d5b51500",
"sha256:c088a000dfdd88c184cc7271bfac8c5b82d9efa8637cd2b68183771e3cf56f04",
"sha256:c0e0550a404c69aab1e04ae89cca3e2a042b56ab043f7f729d984bf73ed2a093",
"sha256:c11003197f908878164f0e6da15fce22373ac3fc320cda8c9d16e6bba105b844",
"sha256:c2a5ff58751670292b406b9f06e07ed1446a4b13ffced6b6cab75b857485cbc8",
"sha256:c35d09db702f4185ba22bb33ef1751ad49c266534339a5cebeb5159d364f6f82",
"sha256:c379425c2707078dfb6bfad2430728831d399dc95a7deeb92015eb4c92345eaf",
"sha256:cc866706d56bd3a7dbf8bac8660c6f6462f2f2b8a49add2ba617bc0c54473d83",
"sha256:d0da39795049a9afcaadec532e7b669b5ebbb2a9134576ebcc15dd5bdae33cc0",
"sha256:f156d6ecfc747ee111c167f8faf5f4953761b5e66e91a4e6767e548d0f80129c",
"sha256:f4ebde71785f8bceb39dcd1e7f06bcc5d5c3cf48b9f69ab52636309387b097c8",
"sha256:fc214a6b75d2e0ea7745488da7da3c381f41790812988c7a92345978414fad37",
"sha256:fd7eef578f5b2200d066db1b50c4aa66410786201669fb76d5238b007918fb24",
"sha256:ff04c373477723430dce2e9d024c708a047d44cf17166bf16e604b379bf0ca14"
"sha256:0412516dcc9de9b0a1e0ae25a280015809de8270f134cc2c1e32c4eeb397cf30",
"sha256:04835e68ef12904bc3e1fd002b33eea0779320d4346082bd5b24bec12ad9c3e9",
"sha256:06d1adaa284696785375fa80a6a8eb309be722cf4ef8949518beb34487a3df71",
"sha256:085a90a99404b859a4b6c3daa42afde17cb3ad3115e44a75f0d7b4a32f06a6c9",
"sha256:0b9911ec70731711c3b6ebcde26caea620cbdd9dcb73c67b0730c8817f24711b",
"sha256:10e00f7336780ca7d3653cf3ac26f068fa11b5a96894ea29a64d3dc4b810d630",
"sha256:11c27e74bab423eb3c9232d97553111cc0be81b74b47165f07ebfdd29d825875",
"sha256:11eb7f98165d56042545c9e6db3ce394ed8b45089a67124298f0473b29cb60b2",
"sha256:13654b521fb98abdecec105ea3fb5ba863d1548c9b58831dd5105bb3873569f1",
"sha256:15ccb81a6ffc57ea0137f9f3ac2737ffa1d11f786244d719639df17476d399a7",
"sha256:18a07a683805d32826c09acfce44a90bf474e6a66ce482b1c7fcd3757d588df3",
"sha256:19ec4cfe4b961edc249b0e04b5618666c23a83bc35842dea2bfd5dfa0157f81b",
"sha256:1c3ff00110835bdda2b1e2b07f4a2548a39744bb7de5946dc8e95517c4fb2ca6",
"sha256:27a330bf7014ee034046db43ccbb05c766aa9e70b8d6c5260bfc38d73103b0ba",
"sha256:2b11c9d310a3522b0fd3c35667914271f570576a0e387701f370eb39d45f08a4",
"sha256:2c661542c6f71dfd9dc82d9d29a8386287e82813b0375b3a02983feac69ef864",
"sha256:2cde7a4d3687f21cffdf5bb171172070bb95e02af448c4c8b2f223d783214056",
"sha256:2d5e9dc0bf1b5d9048a94c48d0813b6c96fccfa4ccf276d9c36308840f40c228",
"sha256:2f23b2d3079522fdf3c09de6517f625f7a964f916c956527bed805ac043799b8",
"sha256:35d27687f027ad25a8d0ef45dd5208ef044c588003cdcedf05afb00dbc5c2deb",
"sha256:35d409030bf3bd05fa66fb5fdedc39c521b397f61ad04309c90444e893d05f7d",
"sha256:4326ea1e2722f3dc00ed77c36d3b5354b8fb7399fb59230249ea6d59cbed90da",
"sha256:4abc247b31a98f29e5224f2d31ef15f86a71f79c7f4d2ac345a5d551d6393073",
"sha256:4d89a2e9219a526401015153c0e9dd48319ea6ab9fe3b066a20aa9aee23d9fd3",
"sha256:4e59e99fd680e2b8b11bbd463f3c9450ab799305d5f2bafb74fefba6ac058616",
"sha256:548794f99ff52a73a156771a0402f5e1c35285bd981046a502d7e4793e8facaa",
"sha256:56fd98c8294f57636084f4b076b75f86c57b2a63a8410c0cd172bc93695ee979",
"sha256:59697568a0455764a094585b2551fd76bfd6b959c9f92d4bdec9d0e14616303a",
"sha256:6bff50ba9891be0a004ef48828e012babaaf7da204d81ab9be37480b9020a82b",
"sha256:6cb3dd7f23b044b0737317f892d399f9e2f0b3a02b22b2c692851fb8120d82c6",
"sha256:7dbfbc0020aa1d9bc1b0b8bcf255a7d73f4ad0336f8fd2533fcc54a4ccfb9441",
"sha256:838eb85de6d9307c19c655c726f8d13b8b646f144ca6b3771fa62b711ebf7624",
"sha256:8b68f565a4175e12e68ca900af8910e8fe48aaa48fd3ca853494f384e11c8bcd",
"sha256:8f284dc1695caf71a74f24993b7c7473d77bc760be45f776a2c2f4e04c170550",
"sha256:963ebdc5365d748185fdb06daf2ac758116deecb2277ec5ae98139f93844bc09",
"sha256:a048dad5ed6ad1fad338c02c609b862dfaa921fcd065d747194a6805f91f2196",
"sha256:a1bd983c565f92779be456ece2479840ec39d386007cd4ae83382646293d681b",
"sha256:a66566f8a22561fc1a88dc87606c69b84fa9ce724f99522cf922c801ec68f5c1",
"sha256:bcb04ff12e79b28be6c9988f275e7ab69f01cc2ba319fb3114f87817bb7c74b6",
"sha256:bd24054aaf21e70a51e2a2a5ed1183560d3a69e6f9594a4bfe360a46f94eba83",
"sha256:be25cb93442c6d2f8702c599b51184bd3ccd83adebd08886b682173e09ef0c3f",
"sha256:c691b26283c3a31594683217d746f1dad59a7ae1d4cfc24626d7a064a11197d4",
"sha256:cc9d0dec711c914ed500f1d0d3822868760954dce98dfb0b7382a854aee55d19",
"sha256:ce2e5e04bb86da6187f96d7bab3f93a7877830981b37f0287dd6479e27a10341",
"sha256:ce651ca46d0202c302a535d3047c55a0131a720cf554a578fc1b8a2aff0e7d96",
"sha256:d0c8ebbfd439c37624db98f3877d9ed12c137cadd99dde2d2eae0dab0bbfc355",
"sha256:d675a876b295afa114ca8bf42d7f86b5fb1298e1b6bb9a24405a3f6c8338811c",
"sha256:dde3f3ed8d00c72631bc19cbfff8ad3b6215062a5eed402381ad365f82f0c18c",
"sha256:e5a31c07cea5edbaeb4bdba6f2b87db7d3dc0f446f379d907e51cc70ea375629",
"sha256:f514c2717012859ccb349c97862568fdc0479aad85b0270d6b5a6509dbc142e2",
"sha256:fc0db32f7223b094964e71729c0361f93db43664dd1ec86d3df217853cedda87",
"sha256:fd4fd83aa912d7b89b4b4a1580d30e2a4242f3936882a3f433586e5ab97ed0d5",
"sha256:feb5db446e96bfecfec078b943cc07744cc759893cef045aa8b8b6d6aaa8274e"
],
"markers": "python_version >= '3.6'",
"version": "==8.3.1"
"version": "==8.3.2"
},
"pycparser": {
"hashes": [
@@ -545,11 +631,11 @@
},
"typing-extensions": {
"hashes": [
"sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
"sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
"sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
],
"version": "==3.10.0.0"
"version": "==3.10.0.2"
},
"typing-utils": {
"hashes": [
@@ -561,19 +647,19 @@
},
"urllib3": {
"hashes": [
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"version": "==1.26.6"
"version": "==1.26.7"
},
"werkzeug": {
"hashes": [
"sha256:1de1db30d010ff1af14a009224ec49ab2329ad2cde454c8a708130642d579c42",
"sha256:6c1ec500dcdba0baa27600f6a22f6333d8b662d22027ff9f6202e3367413caa8"
"sha256:63d3dc1cf60e7b7e35e97fa9861f7397283b75d765afcaefd993d6046899de8f",
"sha256:aa2bb6fc8dee8d6c504c0ac1e7f5f7dc5810a9903e793b6f715a9f015bdadb9a"
],
"markers": "python_version >= '3.6'",
"version": "==2.0.1"
"version": "==2.0.2"
}
},
"develop": {}


@@ -16,6 +16,7 @@ from werkzeug import exceptions
from .. import auth, log, models
from ..data import service
from ..documents import sanitize_document
bp = Blueprint("collections", __name__, url_prefix = "/collections")
LOGGER = logging.getLogger(__name__)
@@ -411,7 +412,9 @@ def get_overlap_ids(collection_id: str):
def _upload_documents(collection, docs):
doc_resp = service.post("/documents", json=docs)
for doc in docs:
sanitize_document(doc)
doc_resp = service.post("documents", json=docs)
# TODO if it failed, roll back the created collection and classifier
if not doc_resp.ok:
abort(doc_resp.status_code, doc_resp.content)


@@ -1,3 +1,3 @@
# (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
from .bp import get_collection_ids_for, get_user_permissions, get_user_permissions_by_id, get_user_permissions_by_ids
from .bp import get_collection_ids_for, get_user_permissions, get_user_permissions_by_id, get_user_permissions_by_ids, sanitize_document


@@ -6,6 +6,7 @@ import re
import typing
from flask import abort, Blueprint, jsonify, request
import lxml.html.clean
from werkzeug import exceptions
from .. import auth, collections, log, models
@@ -13,6 +14,19 @@ from ..data import service
bp = Blueprint("documents", __name__, url_prefix = "/documents")
HTML_CLEANER = lxml.html.clean.Cleaner(
page_structure=True, # keep body only
links=True, # remove <link> (not <a>)
safe_attrs_only=True, # strip out non-standard attributes
style=False, # leave <style>
javascript=True, # no javascript!
scripts=True, # no javascript!!
meta=True, # strip out <meta>
forms=True, # strip out forms
embedded=True, # strip out embedded flash, etc.,
kill_tags=["title"] # otherwise the title gets embedded at the top
)
def _document_user_can_projection():
return service.params({"projection": {
"collection_id": 1
@@ -40,6 +54,9 @@ def get_user_permissions_by_id(document_id: str) -> models.CollectionUserPermiss
def get_user_permissions_by_ids(document_ids: typing.Iterable[str]) -> typing.List[models.CollectionUserPermissions]:
return collections.get_user_permissions_by_ids(get_collection_ids_for(document_ids))
def sanitize_document(document: dict):
if document and "metadata" in document and "html_view" in document["metadata"]:
document["metadata"]["html_view"] = HTML_CLEANER.clean_html(document["metadata"]["html_view"])
@bp.route("/by_id/<doc_id>", methods = ["GET"])
@auth.login_required
@@ -250,6 +267,9 @@ def add_document():
if "has_annotated" not in doc:
doc["has_annotated"] = {user_id: False for user_id in collections_by_id[doc["collection_id"]]["annotators"]}
# sanitize
sanitize_document(doc)
# Add document(s) to database
doc_resp = service.post("documents", json=docs)
if not doc_resp.ok:
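
For reference, a minimal standalone sketch of the sanitization step introduced above. The cleaner settings and the sanitize_document logic mirror the diff; the sample document at the bottom is hypothetical and only illustrates what the cleaner strips from metadata["html_view"].

import lxml.html.clean

HTML_CLEANER = lxml.html.clean.Cleaner(
    page_structure=True,   # keep body only
    links=True,            # remove <link> (not <a>)
    safe_attrs_only=True,  # strip out non-standard attributes
    style=False,           # leave <style>
    javascript=True,       # no javascript!
    scripts=True,          # no javascript!!
    meta=True,             # strip out <meta>
    forms=True,            # strip out forms
    embedded=True,         # strip out embedded flash, etc.
    kill_tags=["title"]    # otherwise the title gets embedded at the top
)

def sanitize_document(document: dict):
    if document and "metadata" in document and "html_view" in document["metadata"]:
        document["metadata"]["html_view"] = HTML_CLEANER.clean_html(document["metadata"]["html_view"])

# Hypothetical document: <script>, the onclick handler, and <title> are removed,
# while plain markup such as <p> survives.
doc = {"metadata": {"html_view": "<html><head><title>t</title></head>"
                                 "<body onclick=\"evil()\"><script>alert(1)</script><p>text</p></body></html>"}}
sanitize_document(doc)
print(doc["metadata"]["html_view"])

With this in place, both the collection bulk upload (_upload_documents) and the single add_document path run every incoming document through the same cleaner before posting it to the data service.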


@@ -91,6 +91,15 @@ class BaseConfig(object):
framework="spacy",
types=["fit", "predict", "status"]
)
),
dict(
name="service_simpletransformers",
version="1.0",
channel="service_simpletransformers",
service=dict(
framework="simpletransformers",
types=["fit", "predict", "status"]
)
)
]


@@ -26,7 +26,7 @@
"src/assets"
],
"styles": [
"src/styles.css",
"src/styles.scss",
"src/themes.scss"
],
"scripts": [


@@ -6936,7 +6936,8 @@
"ini": {
"version": "1.3.8",
"resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz",
"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew=="
"integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==",
"dev": true
},
"inquirer": {
"version": "8.1.2",


@@ -204,52 +204,6 @@ mat-expansion-panel-header {
box-shadow: 2px 2px 2px grey;
}
.annotation, .select {
-moz-box-shadow: 2px 2px 2px grey;
-webkit-box-shadow: 2px 2px 2px grey;
box-shadow: 2px 2px 2px grey;
border-top: 1px solid black;
border-bottom: 1px solid black;
}
.select {
background: white !important;
}
.annotationLeft, .selectLeft {
padding-left: 10px;
border-left: 1px solid black;
-moz-border-top-left-radius: 20px;
border-top-left-radius: 20px;
-moz-border-bottom-left-radius: 20px;
border-bottom-left-radius: 20px;
}
.annotationRight, .selectRight {
padding-right: 10px;
margin-right: 2px;
border-right: 1px solid black;
border-top-right-radius: 20px;
border-bottom-right-radius: 20px;
}
.select {
}
.selectLeft {
}
.selectRight {
}
.doc-label-list {
align-items: center;
}


@@ -1,211 +1,217 @@
<!-- (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.-->
<div fxFlexFill class="page-container" fxLayout="column">
<mat-toolbar>
<button class="doc-back-button" mat-icon-button matTooltip="Go back to collection details"
(click)="backToCollectionDetails()">
<mat-icon>keyboard_arrow_left</mat-icon>
</button>
<span class="page-title">Document {{doc?._id}}</span>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('detailsFlag')">Details</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('docAnnotateFlag')">Labeling</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('imageFlag')">Image</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('documentFlag')">Document</button>
<span fxFlex></span>
<button class="annotate-button" mat-raised-button (click)="save(false)">
<span class="material-icons">save</span>Save
</button>
<span fxFlex="10px"></span>
<button mat-raised-button (click)="save(true)">
<span class="material-icons">skip_next</span>
Save and Go to Next Document
</button>
</mat-toolbar>
<mat-toolbar>
<button class="doc-back-button" mat-icon-button matTooltip="Go back to collection details"
(click)="backToCollectionDetails()">
<mat-icon>keyboard_arrow_left</mat-icon>
</button>
<span class="page-title">Document {{doc?._id}}</span>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('detailsFlag')">Details</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('docAnnotateFlag')">Labeling</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('imageFlag')">Image</button>
<span fxFlex="8px"></span>
<button class="title-toolbar-button" mat-stroked-button (click)="scroll('documentFlag')">Document</button>
<span fxFlex></span>
<button class="annotate-button" mat-raised-button (click)="save(false)">
<span class="material-icons">save</span>Save
</button>
<span fxFlex="10px"></span>
<button mat-raised-button (click)="save(true)">
<span class="material-icons">skip_next</span>
Save and Go to Next Document
</button>
</mat-toolbar>
<div class="page-content" id="page-content" #pageContent>
<div class="page-content" id="page-content" #pageContent>
<app-loading></app-loading>
<app-loading></app-loading>
<mat-accordion *ngIf="!loading.loading && !loading.error" [multi]="true" displayMode="flat">
<mat-expansion-panel class="mat-elevation-z0" id="detailsFlag" [expanded]="panelExpanded.detailsFlag"
(closed)="panelIsOpen('detailsFlag', false)" (opened)="panelIsOpen('detailsFlag', true)"
(afterExpand)="onAfterExpand('detailsFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document Details</mat-panel-title>
</mat-expansion-panel-header>
<mat-accordion *ngIf="!loading.loading && !loading.error" [multi]="true" displayMode="flat">
<mat-expansion-panel class="mat-elevation-z0" id="detailsFlag" [expanded]="panelExpanded.detailsFlag"
(closed)="panelIsOpen('detailsFlag', false)" (opened)="panelIsOpen('detailsFlag', true)"
(afterExpand)="onAfterExpand('detailsFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document Details</mat-panel-title>
</mat-expansion-panel-header>
<app-document-details expanded="true" [document]="doc" [collection]="collection"
(imageUrlChanged)="imageChanged($event)">
</app-document-details>
</mat-expansion-panel>
<app-document-details expanded="true" [document]="doc" [collection]="collection"
(imageUrlChanged)="imageChanged($event)">
</app-document-details>
</mat-expansion-panel>
<mat-expansion-panel class="mat-elevation-z0" id="docAnnotateFlag" [expanded]="panelExpanded.docAnnotateFlag"
(closed)="panelIsOpen('docAnnotateFlag', false)" (opened)="panelIsOpen('docAnnotateFlag', true)"
(afterExpand)="onAfterExpand('docAnnotateFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document Labeling</mat-panel-title>
</mat-expansion-panel-header>
<mat-expansion-panel class="mat-elevation-z0" id="docAnnotateFlag"
[expanded]="panelExpanded.docAnnotateFlag" (closed)="panelIsOpen('docAnnotateFlag', false)"
(opened)="panelIsOpen('docAnnotateFlag', true)" (afterExpand)="onAfterExpand('docAnnotateFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document Labeling</mat-panel-title>
</mat-expansion-panel-header>
<div class="doc-labeling-container">
<div fxLayout="row">
<mat-error *ngIf="!permissions.annotate" id="cantAnnotate">
<h3>Note: you do not have authority to change or add annotations for this document.</h3>
</mat-error>
</div>
<div class="doc-labeling-container">
<div fxLayout="row">
<mat-error *ngIf="!permissions.annotate" id="cantAnnotate">
<h3>Note: you do not have authority to change or add annotations for this document.</h3>
</mat-error>
</div>
<div class="doc-label-list" fxLayout="row">
<mat-chip-list fxFlex>
<mat-checkbox *ngFor="let annotation of myDocAnnotations;" [(ngModel)]="annotation.checked"
style="padding-right: 30px">
<mat-chip [style.background-color]="annotation.label.color"
class="shadowed cursor-pointer">
{{annotation.label.name}}</mat-chip>
</mat-checkbox>
</mat-chip-list>
</div>
</div>
</mat-expansion-panel>
<div class="doc-label-list" fxLayout="row">
<mat-chip-list fxFlex>
<mat-checkbox *ngFor="let annotation of myDocAnnotations;" [(ngModel)]="annotation.checked"
style="padding-right: 30px">
<mat-chip [style.background-color]="annotation.label.color"
class="shadowed cursor-pointer">
{{annotation.label.name}}</mat-chip>
</mat-checkbox>
</mat-chip-list>
</div>
</div>
</mat-expansion-panel>
<mat-expansion-panel class="mat-elevation-z0" id="imageFlag" class="no-padding" [expanded]="panelExpanded.imageFlag"
(closed)="panelIsOpen('imageFlag', false)" (opened)="panelIsOpen('imageFlag', true)"
(afterExpand)="onAfterExpand('imageFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Image</mat-panel-title>
</mat-expansion-panel-header>
<mat-expansion-panel class="mat-elevation-z0" id="imageFlag" class="no-padding"
[expanded]="panelExpanded.imageFlag" (closed)="panelIsOpen('imageFlag', false)"
(opened)="panelIsOpen('imageFlag', true)" (afterExpand)="onAfterExpand('imageFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Image</mat-panel-title>
</mat-expansion-panel-header>
<div *ngIf="doc.metadata && doc.metadata['imageUrl']" id="myDocImage" class="image-container"
[ngStyle]="{'height': (pageHeight - 48 - 127) + 'px'}" #imageContainer>
<div
style="position: absolute; top: 0px; bottom: 20px; left: 20px; right: 20px; background-color: lightgray">
<button class="full-screen-btn" mat-raised-button (click)="toggleImageFullscreen()">{{
isImageFullscreen() ? 'Close' : 'Open' }} Full
Screen</button>
<app-image-explorer [imageUrl]="doc.metadata['imageUrl']" [documentId]="doc._id"
[collectionId]="collection._id"></app-image-explorer>
</div>
</div>
</mat-expansion-panel>
<div *ngIf="doc.metadata && doc.metadata['imageUrl']" id="myDocImage" class="image-container"
[ngStyle]="{'height': (pageHeight - 48 - 127) + 'px'}" #imageContainer>
<div
style="position: absolute; top: 0px; bottom: 20px; left: 20px; right: 20px; background-color: lightgray">
<button class="full-screen-btn" mat-raised-button (click)="toggleImageFullscreen()">{{
isImageFullscreen() ? 'Close' : 'Open' }} Full
Screen</button>
<app-image-explorer [imageUrl]="doc.metadata['imageUrl']" [documentId]="doc._id"
[collectionId]="collection._id"></app-image-explorer>
</div>
</div>
</mat-expansion-panel>
<mat-expansion-panel class="mat-elevation-z0" id="documentFlag" class="no-padding" [expanded]="panelExpanded.documentFlag"
(closed)="panelIsOpen('documentFlag', false)" (opened)="panelIsOpen('documentFlag', true)"
(afterExpand)="onAfterExpand('documentFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document</mat-panel-title>
</mat-expansion-panel-header>
<mat-expansion-panel class="mat-elevation-z0" id="documentFlag" class="no-padding"
[expanded]="panelExpanded.documentFlag" (closed)="panelIsOpen('documentFlag', false)"
(opened)="panelIsOpen('documentFlag', true)" (afterExpand)="onAfterExpand('documentFlag')">
<mat-expansion-panel-header>
<mat-panel-title>Document</mat-panel-title>
</mat-expansion-panel-header>
<div class="doc-content-container" [ngStyle]="{'height': (pageHeight - 48 - 127) + 'px'}">
<div class="filter-bar">
<button mat-icon-button (click)="showList = !showList">
<mat-icon>list</mat-icon>
</button>
<div class="doc-content-container" [ngStyle]="{'height': (pageHeight - 48 - 127) + 'px'}">
<div class="filter-bar">
<button mat-icon-button (click)="showList = !showList">
<mat-icon>list</mat-icon>
</button>
<span fxFlex="22px"></span>
<span fxFlex="22px"></span>
<span *ngIf="others.length === 0">No annotations from other users.</span>
<div *ngIf="others.length > 0" id="others">
<mat-form-field fxFlex="180px" floatLabel="never">
<mat-label>Show Annotations:</mat-label>
<mat-select id="othersAnnotations" value="" #othersSelect>
<mat-option value="" (click)="showAnnotationsOf(othersSelect, null)">
Mine
</mat-option>
<mat-option *ngFor="let other of others" [value]="other"
(click)="showAnnotationsOf(othersSelect, other)">
{{ auth.getUserDisplayName(other) }}</mat-option>
</mat-select>
</mat-form-field>
<span fxFlex="10px"></span>
<mat-chip-list
*ngIf="othersSelect.value && othersDocAnnotations.hasOwnProperty(othersSelect.value) && othersDocAnnotations[othersSelect.value].length > 0">
<mat-chip *ngFor="let label of othersDocAnnotations[othersSelect.value]"
[style.background-color]="getColorFor(label)">
{{label}}
</mat-chip>
</mat-chip-list>
<span
*ngIf="othersSelect.value && (!othersDocAnnotations.hasOwnProperty(othersSelect.value) || othersDocAnnotations[othersSelect.value].length === 0)">No
labels for this document.</span>
</div>
<span fxFlex></span>
<div>
<span>
<b>
Document Overall Agreement:
</b>
<span *ngIf="ann_agreement != null && ann_agreement != 'null'">{{ann_agreement |
percent:'1.2-2'}}</span>
<span *ngIf="ann_agreement == null || ann_agreement == 'null'">N/A</span>
</span>
</div>
</div>
<span *ngIf="others.length === 0">No annotations from other users.</span>
<div *ngIf="others.length > 0" id="others">
<mat-form-field fxFlex="180px" floatLabel="never">
<mat-label>Show Annotations:</mat-label>
<mat-select id="othersAnnotations" value="" #othersSelect>
<mat-option value="" (click)="showAnnotationsOf(othersSelect, null)">
Mine
</mat-option>
<mat-option *ngFor="let other of others" [value]="other"
(click)="showAnnotationsOf(othersSelect, other)">
{{ auth.getUserDisplayName(other) }}</mat-option>
</mat-select>
</mat-form-field>
<span fxFlex="10px"></span>
<mat-chip-list
*ngIf="othersSelect.value && othersDocAnnotations.hasOwnProperty(othersSelect.value) && othersDocAnnotations[othersSelect.value].length > 0">
<mat-chip *ngFor="let label of othersDocAnnotations[othersSelect.value]"
[style.background-color]="getColorFor(label)">
{{label}}
</mat-chip>
</mat-chip-list>
<span
*ngIf="othersSelect.value && (!othersDocAnnotations.hasOwnProperty(othersSelect.value) || othersDocAnnotations[othersSelect.value].length === 0)">No
labels for this document.</span>
</div>
<span fxFlex></span>
<div>
<span>
<b>
Document Overall Agreement:
</b>
<span *ngIf="ann_agreement != null && ann_agreement != 'null'">{{ann_agreement |
percent:'1.2-2'}}</span>
<span *ngIf="ann_agreement == null || ann_agreement == 'null'">N/A</span>
</span>
</div>
</div>
<div class="annotate-area">
<div *ngIf="showList" class="annotate-table-container" fxFlex="30%">
<app-ner-annotation-table [labels]="availableLabels" [data]="nerData"
(remove)="removeAnnotation($event)" [readOnly]="showingAnnotationsFor !== null">
</app-ner-annotation-table>
</div>
<div class="annotate-doc-container" fxFlex>
<div class="annotate-doc-toolbar" fxLayout="row">
<span class="mat-title">NER Annotations</span>
<span fxFlex></span>
<span *ngIf="showingAnnotationsFor === null">Click to select text; right-click to
annotate
selection</span>
<span *ngIf="showingAnnotationsFor !== null">Showing
{{ auth.getUserDisplayName(showingAnnotationsFor) }}'s
annotations in read-only mode</span>
<span fxFlex="10px"></span>
<mat-menu #settingsMenu="matMenu" id="settings">
<button>
<mat-checkbox matMenuItem [(ngModel)]="settingMonospace"
(click)="$event.stopPropagation()" class="mat-menu-item">
Monospace font
</mat-checkbox>
</button>
</mat-menu>
<button mat-icon-button [matMenuTriggerFor]="settingsMenu" id="settingsButton"
matTooltip="Document/annotation settings">
<mat-icon>settings</mat-icon>
</button>
</div>
<div class="annotate-area">
<div *ngIf="showList" class="annotate-table-container" fxFlex="30%">
<app-ner-annotation-table [labels]="availableLabels" [data]="nerData"
(remove)="removeAnnotation($event)" [readOnly]="showingAnnotationsFor !== null">
</app-ner-annotation-table>
</div>
<div class="annotate-doc-container" fxFlex>
<div class="annotate-doc-toolbar" fxLayout="row">
<span class="mat-title">NER Annotations</span>
<span fxFlex></span>
<span *ngIf="showingAnnotationsFor === null">Click to select text; right-click to
annotate
selection</span>
<span *ngIf="showingAnnotationsFor !== null">Showing
{{ auth.getUserDisplayName(showingAnnotationsFor) }}'s
annotations in read-only mode</span>
<span fxFlex="10px"></span>
<mat-menu #settingsMenu="matMenu" id="settings">
<button>
<mat-checkbox matMenuItem [(ngModel)]="settingMonospace"
(click)="$event.stopPropagation()" class="mat-menu-item">
Monospace font
</mat-checkbox>
</button>
</mat-menu>
<button mat-icon-button [matMenuTriggerFor]="settingsMenu" id="settingsButton"
matTooltip="Document/annotation settings">
<mat-icon>settings</mat-icon>
</button>
</div>
<div #docElem id="doc" class="cursor-pointer">
<!-- set word-start and word-end to help with testing -->
<span #wordsList class="word" *ngFor="let word of nerData.words" [id]="word.id"
[attr.word-start]="word.start" [attr.word-end]="word.end"
[matTooltip]="getWordTooltip(word)" (mousedown)="mousedown($event, word)"
(mouseover)="mouseover($event, word)" (mouseout)="mouseout($event, word)"
(mouseup)="mouseup($event, word)" (click)="click($event, word)"
(contextmenu)="contextMenu($event, word)">{{ word.text }}</span>
</div>
<div #docElem id="doc" class="cursor-pointer">
<span id="words-html">
<div *ngIf="styleHtml" [innerHtml]="styleHtml"></div>
<div *ngIf="contentHtml" [innerHtml]="contentHtml"></div>
</span>
<!-- set word-start and word-end to help with testing -->
<ng-container *ngIf="!contentHtml">
<span #wordsList class="word" *ngFor="let word of nerData.words" [id]="word.id"
[attr.word-start]="word.start" [attr.word-end]="word.end"
[matTooltip]="getWordTooltip(word)" (mousedown)="mousedown($event, word)"
(mouseover)="mouseover($event, word)" (mouseout)="mouseout($event, word)"
(mouseup)="mouseup($event, word)" (click)="click($event, word)"
(contextmenu)="contextMenu($event, word)">{{ word.text }}</span>
</ng-container>
</div>
<div *ngIf="!allowOverlappingNerAnnotations"> (Note: overlapping annotations are not allowed
for
this
collection.)
</div>
<div *ngIf="!allowOverlappingNerAnnotations"> (Note: overlapping annotations are not allowed
for
this
collection.)
</div>
<div #popoverTemplate id="popoverTemplate" class="popover" hidden>
<mat-chip-list>
<mat-chip *ngFor="let label of availableLabels"
[style.background-color]="label.color"
class="shadowed cursor-pointer doc-label-chip">{{label.name}}</mat-chip>
</mat-chip-list>
<div style="padding: 2px">
<button mat-raised-button color="warn">
Remove / Reset
</button>
</div>
</div>
</div>
</div>
</div>
<div #popoverTemplate id="popoverTemplate" class="popover" hidden>
<mat-chip-list>
<mat-chip *ngFor="let label of availableLabels"
[style.background-color]="label.color"
class="shadowed cursor-pointer doc-label-chip">{{label.name}}</mat-chip>
</mat-chip-list>
<div style="padding: 2px">
<button mat-raised-button color="warn">
Remove / Reset
</button>
</div>
</div>
</div>
</div>
</div>
</mat-expansion-panel>
</mat-accordion>
</div>
</mat-expansion-panel>
</mat-accordion>
</div>
</div>


@@ -11,6 +11,7 @@ export class NerData {
public words: Word[];
public annotations: NerAnnotation[];
private wordIndices: object;
private wordMap: { [id: string]: Word } = {};
constructor() {
this.changed = new EventEmitter<NerAnnotation[]>();
@@ -22,7 +23,18 @@ export class NerData {
public setWordsAndAnnotations(words: Word[], annotations: NerAnnotation[]) {
this.words = words;
this.setAnnotations(annotations);
console.log(this.words);
this.wordMap = {};
for(const word of words) {
this.wordMap[word.id] = word;
}
}
public getWordById(id: string) {
return this.wordMap[id];
}
public setAnnotations(annotations: NerAnnotation[]) {
this.annotations = annotations.slice();


@@ -89,6 +89,9 @@ export class NerSelection {
this.words[0].elem.classList.remove("selectLeft");
for(let i = this.words[0].index - 1; i >= word.index; i--) {
const docWord = nerData.words[i];
if(!docWord?.elem) {
continue;
}
docWord.elem.classList.add("select");
if(i === word.index) {
docWord.elem.classList.add("selectLeft");
@@ -101,6 +104,9 @@ export class NerSelection {
this.words[this.words.length - 1].elem.classList.remove("selectRight");
for(let i = this.words[this.words.length - 1].index + 1; i <= word.index; i++) {
const docWord = nerData.words[i];
if(!docWord?.elem) {
continue;
}
docWord.elem.classList.add("select");
if(i === word.index) {
docWord.elem.classList.add("selectRight");


@@ -1,56 +1,62 @@
<!-- (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.-->
<div class="detail-container">
<table class="metadata-table">
<tr class="space-under">
<td><b>Document ID:</b></td>
<td>{{document?._id}}</td>
</tr>
<tr class="space-under">
<td><b>Creation Date:</b></td>
<td>{{document?._created}}</td>
</tr>
<tr class="space-under">
<td><b>Last Updated:</b></td>
<td>{{document?._updated}}</td>
</tr>
<tr class="space-under">
<td><b>Creator:</b></td>
<td>{{auth.getUserDisplayName(document?.creator_id)}}</td>
</tr>
<tr class="space-under">
<td><b>Metadata:</b></td>
<td>
<table>
<ng-container *ngIf="document && document?.metadata">
<tr *ngFor="let item of document?.metadata | keyvalue">
<td><b>{{item.key}}</b></td>
<td *ngIf="item.key !== 'imageUrl'">{{item.value}}</td>
<td *ngIf="item.key === 'imageUrl'">
<a [href]="collections.collectionImageUrl(collection._id, item.value)" target="_blank">
{{item.value}}
<span
*ngIf="item.value !== collections.collectionImageUrl(collection._id, item.value)">({{collections.collectionImageUrl(collection._id, item.value)}})</span>
</a>
<div><button *ngIf="permissions.modify_document_metadata" mat-button mat-raised-button
(click)="updateImage()">Update document image</button></div>
</td>
</tr>
</ng-container>
<tr
*ngIf="permissions.modify_document_metadata && (!document || !document.metadata || !document.metadata.hasOwnProperty('imageUrl'))">
<td><b>imageUrl</b></td>
<td><button mat-button mat-raised-button (click)="updateImage()">Update document
image</button></td>
</tr>
</table>
</td>
</tr>
<tr>
<td><b>Collection:</b></td>
<td *ngIf="!collection">Loading...</td>
<td *ngIf="collection">{{ collection.hasTitle() ? collection.getTitle() + " (" : "" }}<a href="#"
[routerLink]="['/' + PATHS.collection.details, document?.collection_id]">{{document?.collection_id}}</a>{{ collection.hasTitle() ? ")" : "" }}
</td>
</tr>
</table>
<table class="metadata-table">
<tr class="space-under">
<td><b>Document ID:</b></td>
<td>{{document?._id}}</td>
</tr>
<tr class="space-under">
<td><b>Creation Date:</b></td>
<td>{{document?._created}}</td>
</tr>
<tr class="space-under">
<td><b>Last Updated:</b></td>
<td>{{document?._updated}}</td>
</tr>
<tr class="space-under">
<td><b>Creator:</b></td>
<td>{{auth.getUserDisplayName(document?.creator_id)}}</td>
</tr>
<tr class="space-under">
<td><b>Metadata:</b></td>
<td>
<table>
<ng-container *ngIf="document && document?.metadata">
<tr *ngFor="let item of document?.metadata | keyvalue">
<ng-container *ngIf="item.key != 'html_view'">
<td><b>{{item.key}}</b></td>
<td *ngIf="item.key !== 'imageUrl'">{{item.value}}</td>
<td *ngIf="item.key === 'imageUrl'">
<a [href]="collections.collectionImageUrl(collection._id, item.value)"
target="_blank">
{{item.value}}
<span
*ngIf="item.value !== collections.collectionImageUrl(collection._id, item.value)">({{collections.collectionImageUrl(collection._id,
item.value)}})</span>
</a>
<div><button *ngIf="permissions.modify_document_metadata" mat-button
mat-raised-button (click)="updateImage()">Update document image</button>
</div>
</td>
</ng-container>
</tr>
</ng-container>
<tr
*ngIf="permissions.modify_document_metadata && (!document || !document.metadata || !document.metadata.hasOwnProperty('imageUrl'))">
<td><b>imageUrl</b></td>
<td><button mat-button mat-raised-button (click)="updateImage()">Update document
image</button></td>
</tr>
</table>
</td>
</tr>
<tr>
<td><b>Collection:</b></td>
<td *ngIf="!collection">Loading...</td>
<td *ngIf="collection">{{ collection.hasTitle() ? collection.getTitle() + " (" : "" }}<a href="#"
[routerLink]="['/' + PATHS.collection.details, document?.collection_id]">{{document?.collection_id}}</a>{{
collection.hasTitle() ? ")" : "" }}
</td>
</tr>
</table>
</div>


@@ -1,6 +1,6 @@
/*(C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC. */
import { Component, OnInit, ViewChild, Input, Output, EventEmitter } from '@angular/core';
import { Component, OnInit, ViewChild, Input, Output, EventEmitter, ChangeDetectorRef } from '@angular/core';
import { MatPaginator } from '@angular/material/paginator';
import { MatSort, MatSortable } from '@angular/material/sort';
import { MatTable, MatTableDataSource } from '@angular/material/table';
@@ -44,7 +44,9 @@ export class NERAnnotationTableComponent implements OnInit {
public dataSource: MatTableDataSource<NerAnnotation>;
constructor() {
constructor(
private cdr: ChangeDetectorRef
) {
this.dataSource = new MatTableDataSource<NerAnnotation>();
this.dataSource.filterPredicate = (annotation, value): boolean => {
if(annotation.label.toLowerCase().includes(value)) {
@@ -60,6 +62,7 @@ export class NERAnnotationTableComponent implements OnInit {
ngOnInit() {
this.data.changed.subscribe((res: NerAnnotation[]) => {
this.dataSource.data = res;
this.cdr.detectChanges();
});
this.dataSource.sortingDataAccessor = (annotation: NerAnnotation, property: string) => {
switch(property) {


@@ -1,5 +1,6 @@
// (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
import * as _ from "lodash";
import { Observable } from "rxjs";
export class Word {
@@ -59,5 +60,19 @@ export class Word {
return words;
}
public static parseWordObjectsFromHtml(elems: HTMLElement[]): Word[] {
const words = [];
_.forEach(elems, (elem: HTMLElement) => {
console.log(elem);
let id = elem.getAttribute('ID');
let parts = id.split('_');
let start = parts[1];
//let end = parts[2];
let wordObj = new Word(+start, elem.innerHTML, words.length);
words.push(wordObj);
});
return words;
}
}


@@ -58,3 +58,39 @@ td.space-left {
.spacer {
flex: 1 1 auto;
}
.annotate-area {
.annotation, .select {
-moz-box-shadow: 2px 2px 2px grey;
-webkit-box-shadow: 2px 2px 2px grey;
box-shadow: 2px 2px 2px grey;
border-top: 1px solid black;
border-bottom: 1px solid black;
}
.select {
background: rgba(255,255,255,0.4) !important;
}
.annotationLeft, .selectLeft {
padding-left: 10px;
border-left: 1px solid black;
-moz-border-top-left-radius: 20px;
border-top-left-radius: 20px;
-moz-border-bottom-left-radius: 20px;
border-bottom-left-radius: 20px;
}
.annotationRight, .selectRight {
padding-right: 10px;
margin-right: 2px;
border-right: 1px solid black;
border-top-right-radius: 20px;
border-bottom-right-radius: 20px;
}
}


@@ -76,8 +76,9 @@ WORKDIR ${ROOT_DIR}
# pipenv causing container to fail to rebuild if spacy installed previously
#Install python requirements
COPY Pipfile Pipfile.lock ./
RUN REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt pipenv install --dev --system --deploy
RUN REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt PIPENV_INSTALL_TIMEOUT=30 \
pipenv install --dev --system --deploy
RUN python3 -m nltk.downloader punkt
#Copy contents of pipeline folder to docker
COPY pine ./pine


@@ -22,6 +22,10 @@ scikit-multilearn = "~=0.2.0"
python-json-logger = "~=2.0.2"
overrides = "~=6.1.0"
typing-extensions = "~=3.10.0.0"
pandas = "~=1.3.3"
simpletransformers = "~=0.61.13"
torch = {file = "https://download.pytorch.org/whl/cpu/torch-1.9.0%2Bcpu-cp38-cp38-linux_x86_64.whl"}
nltk = "~=3.6.7"
[requires]
python_version = "3.8"

pipelines/Pipfile.lock (generated)

File diff suppressed because it is too large.


@@ -9,8 +9,10 @@ fi
set -x
pipenv run python3 -m nltk.downloader punkt
PIDS=""
for SERVICE in opennlp corenlp spacy; do
for SERVICE in simpletransformers opennlp corenlp spacy; do
AL_PIPELINE=${SERVICE} pipenv run python3 -m pine.pipelines.run_service &
PIDS="${PIDS} $!"
done


@@ -10,6 +10,15 @@ from .shared.config import ConfigBuilder
logger = logging.getLogger(__name__)
config = ConfigBuilder.get_config()
class EveDocsAndAnnotations:
def __init__(self):
self.all_labels: typing.List[str] = []
self.documents: typing.List[str] = []
self.annotations: typing.List = []
self.doc_ids: typing.List[str] = []
self.ann_ids: typing.List[str] = []
class EveClient(object):
eve_headers = {'Content-Type': 'application/json'}
@@ -105,7 +114,7 @@ class EveClient(object):
}
return self._get_documents_map(params)
def get_docs_with_annotations(self, collection_id: str, doc_map: typing.Dict[str, str]) -> typing.Tuple[typing.List[str], typing.List[str], typing.List[str], typing.List[str]]:
def get_docs_with_annotations(self, collection_id: str, doc_map: typing.Dict[str, str]) -> EveDocsAndAnnotations:
"""Gets document and annotation data. Only non-overlapping documents are returned.
:param collection_id: str: the ID of the collection
@@ -116,10 +125,11 @@ class EveClient(object):
ann_ids is a list of the annotation IDs
:rtype: tuple
"""
doc_ids = list()
documents = []
ann_ids = list()
labels = []
data = EveDocsAndAnnotations()
# get all labels from collection object
collection = self.get_obj("collections", collection_id)
data.all_labels = collection["labels"]
#get annotations and make data
query = 'annotations?where={"collection_id":"%s"}' % (collection_id)
@@ -132,15 +142,15 @@ class EveClient(object):
# remove overlaps
if docid not in doc_map:
continue
doc_ids.append(docid)
documents.append(doc_map[docid])
ann_ids.append(a["_id"])
labels.append(a["annotation"])
data.doc_ids.append(docid)
data.documents.append(doc_map[docid])
data.ann_ids.append(a["_id"])
data.annotations.append(a["annotation"])
if query is None:
break
return documents, labels, doc_ids, ann_ids
return data
def update(self, resource, id, etag, update_obj):
headers = {'Content-Type': 'application/json', 'If-Match': etag}
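
Instead of the old four-element tuple, get_docs_with_annotations now returns a single EveDocsAndAnnotations container. The sketch below shows the shape callers now work with; the values (and the annotation payload format) are hypothetical, while the field names come from the class added above.

import typing

class EveDocsAndAnnotations:
    # Field names match the class added in this diff.
    def __init__(self):
        self.all_labels: typing.List[str] = []
        self.documents: typing.List[str] = []
        self.annotations: typing.List = []
        self.doc_ids: typing.List[str] = []
        self.ann_ids: typing.List[str] = []

# Hypothetical result, shaped the way get_docs_with_annotations now fills it in.
data = EveDocsAndAnnotations()
data.all_labels = ["PER", "LOC"]               # every label defined on the collection
data.doc_ids = ["doc1"]
data.documents = ["John lives in Baltimore"]
data.ann_ids = ["ann1"]
data.annotations = [[(0, 4, "PER"), (14, 23, "LOC")]]  # illustrative payload only

# Callers that previously unpacked
#   documents, labels, doc_ids, ann_ids = client.get_docs_with_annotations(...)
# now read named fields, with data.all_labels feeding the new fit/evaluate
# signatures in ner_api.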


@@ -12,14 +12,31 @@ from skmultilearn.model_selection import IterativeStratification
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import chain
from .EveClient import EveClient
from .EveClient import EveClient, EveDocsAndAnnotations
from . import RankingFunctions as rank
from .pipeline import EvaluationMetrics, StatMetrics
from .pmap_ner import NER
from .shared.config import ConfigBuilder
logger = logging.getLogger(__name__)
config = ConfigBuilder.get_config()
class FiveFoldResult(object):
def __init__(self):
self.metrics: typing.List[EvaluationMetrics] = []
# store list of documents ids per fold
self.folds: typing.List[typing.List] = []
self.average_metrics: typing.Dict[str, StatMetrics] = {}
def serialize_metrics(self):
return [x.serialize() for x in self.metrics]
def serialize_folds(self):
return list(self.folds) # make a copy
def serialize_average_metrics(self):
return {label: self.average_metrics[label].serialize() for label in self.average_metrics.keys()}
class ner_api(object):
@@ -43,16 +60,14 @@ class ner_api(object):
status["has_trained"] = "filename" in classifier_obj
return status
def perform_fold(self, model: NER, train_data, test_data, **pipeline_parameters):
model.fit(train_data[0], train_data[1], **pipeline_parameters)
results = model.evaluate(test_data[0], test_data[1], range(0, len(test_data[0])))
def perform_fold(self, model: NER, all_labels: typing.List[str], train_data, test_data, **pipeline_parameters) -> EvaluationMetrics:
model.fit(train_data[0], train_data[1], all_labels, **pipeline_parameters)
results = model.evaluate(test_data[0], test_data[1], all_labels)
return results
def perform_five_fold(self, model: NER, documents, annotations, doc_ids, **pipeline_parameters):
metrics = list()
# store list of documents ids per fold
folds = list()
def perform_five_fold(self, model: NER, all_labels: typing.List[str], documents, annotations, doc_ids: typing.List[str], **pipeline_parameters) -> FiveFoldResult:
results = FiveFoldResult()
# turning into numpy arrays to be able to access values with index array
documents_np_array = np.array(documents)
annotations_np_array = np.array(annotations, dtype=object)
@@ -84,51 +99,39 @@ class ner_api(object):
train_documents = documents_np_array[train_index]
test_documents = documents_np_array[test_index]
fold_metrics = self.perform_fold(model, [train_documents.tolist(), train_annotations.tolist()],
[test_documents.tolist(), test_annotations.tolist()], **pipeline_parameters)
fold_metrics = self.perform_fold(model, all_labels,
[train_documents.tolist(), train_annotations.tolist()],
[test_documents.tolist(), test_annotations.tolist()],
**pipeline_parameters)
# saving docs used to train fold
fold_doc_ids = doc_ids_np_array[train_index]
folds.append(fold_doc_ids.tolist())
results.folds.append(fold_doc_ids.tolist())
# saving fold metrics
metrics.append(fold_metrics)
results.metrics.append(fold_metrics)
for key in fold_metrics.keys():
for key in fold_metrics.labels.keys():
if key not in total_metrics:
total_metrics[key] = {"FN": 0, "FP": 0, "TP": 0, "TN": 0, "f1": 0, "precision": 0, "recall": 0, "acc": 0}
total_metrics[key]["FN"] = total_metrics[key]["FN"] + fold_metrics[key]["FN"]
total_metrics[key]["FP"] = total_metrics[key]["FP"] + fold_metrics[key]["FP"]
total_metrics[key]["TP"] = total_metrics[key]["TP"] + fold_metrics[key]["TP"]
total_metrics[key]["TN"] = total_metrics[key]["TN"] + fold_metrics[key]["TN"]
total_metrics[key] = StatMetrics()
total_metrics[key].fn += fold_metrics.labels[key].fn
total_metrics[key].fp += fold_metrics.labels[key].fp
total_metrics[key].tp += fold_metrics.labels[key].tp
total_metrics[key].tn += fold_metrics.labels[key].tn
average_metrics = {}
for label in total_metrics.keys():
avg_metric = {}
avg_metric["FN"] = total_metrics[label]["FN"] / 5
avg_metric["FP"] = total_metrics[label]["FP"] / 5
avg_metric["TP"] = total_metrics[label]["TP"] / 5
avg_metric["TN"] = total_metrics[label]["TN"] / 5
if (avg_metric["TP"] + avg_metric["FN"]) != 0:
avg_metric["recall"] = avg_metric["TP"] / (avg_metric["TP"] + avg_metric["FN"])
else:
avg_metric["recall"] = 1.0
if (avg_metric["TP"] + avg_metric["FP"]) != 0:
avg_metric["precision"] = avg_metric["TP"] / (avg_metric["TP"] + avg_metric["FP"])
else:
avg_metric["precision"] = 0.0
if (avg_metric["precision"] + avg_metric["recall"]) != 0:
avg_metric["f1"] = 2 * (avg_metric["precision"] * avg_metric["recall"]) / (avg_metric["precision"] + avg_metric["recall"])
else:
avg_metric["f1"] = 0
avg_metric["acc"] = (avg_metric["TP"] + avg_metric["TN"]) / (avg_metric["TP"] + avg_metric["TN"] + avg_metric["FP"] + avg_metric["FN"])
avg_metric = StatMetrics()
avg_metric.fn = total_metrics[label].fn / 5
avg_metric.fp = total_metrics[label].fp / 5
avg_metric.tp = total_metrics[label].tp / 5
avg_metric.tn = total_metrics[label].tn / 5
avg_metric.calc_precision_recall_f1_acc()
average_metrics[label] = avg_metric
results.average_metrics[label] = avg_metric
return metrics, folds, average_metrics
return results
def get_document_ranking(self, model: NER, doc_map: typing.Dict[str, str], doc_ids: typing.List[str]) -> typing.List[str]:
"""Calculates document rankings and returns document IDs sorted by ranking.
@@ -189,19 +192,21 @@ class ner_api(object):
# get documents where overlap is 0
doc_map = self.eve_client.get_documents(collection_id)
# get documents with its annotations where overlap is 0
documents, labels, doc_ids, ann_ids = self.eve_client.get_docs_with_annotations(collection_id, doc_map)
eve_data = self.eve_client.get_docs_with_annotations(collection_id, doc_map)
# instantiate model
classifier = NER(pipeline_name)
# get folds information
metrics, folds, averages = self.perform_five_fold(classifier, documents, labels, doc_ids, **pipeline_parameters)
fold_results = self.perform_five_fold(classifier, eve_data.all_labels,
eve_data.documents, eve_data.annotations,
eve_data.doc_ids, **pipeline_parameters)
logger.info("Starting to train classifier for {} pipeline".format(pipeline_name))
fit_results = classifier.fit(documents, labels, **pipeline_parameters)
fit_results = classifier.fit(eve_data.documents, eve_data.annotations, eve_data.all_labels, **pipeline_parameters)
results = {
"fit": fit_results,
"average_metrics": averages,
"average_metrics": fold_results.serialize_average_metrics(),
"updated_objects": {}
}
@@ -221,11 +226,11 @@ class ner_api(object):
# update classifier metrics on eve
metrics_updated_obj = {
'trained_classifier_db_version': classifier_obj['_version']+1,
'documents': list(set(chain.from_iterable(folds))),
'annotations': list(ann_ids),
'folds': list(folds),
'metrics': list(metrics),
'metric_averages': dict(averages),
'documents': list(set(chain.from_iterable(fold_results.folds))),
'annotations': list(eve_data.ann_ids),
'folds': fold_results.serialize_folds(),
'metrics': fold_results.serialize_metrics(),
'metric_averages': fold_results.serialize_average_metrics(),
'filename': filename
}
if not self.eve_client.update('metrics', metrics_obj["_id"], metrics_obj['_etag'], metrics_updated_obj):
@@ -234,7 +239,7 @@ class ner_api(object):
results["updated_objects"]["metrics"] = [metrics_obj["_id"]]
# re rank documents
ranks = self.get_document_ranking(classifier, doc_map, doc_ids)
ranks = self.get_document_ranking(classifier, doc_map, eve_data.doc_ids)
logger.info("Performing document rankings")
# Save updates to eve
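
To make the five-fold averaging in this file concrete, here is a small sketch (with made-up fold totals) of how the summed counts become the reported per-label averages via StatMetrics; the import path is assumed from the package layout used by run_service:

from pine.pipelines.pipeline import StatMetrics  # assumed import path

# Made-up totals summed over the five folds for one label: 40 TP, 10 FP, 5 FN, 445 TN.
avg_metric = StatMetrics()
avg_metric.tp, avg_metric.fp, avg_metric.fn, avg_metric.tn = 40 / 5, 10 / 5, 5 / 5, 445 / 5
avg_metric.calc_precision_recall_f1_acc()
# precision = 8 / (8 + 2) = 0.8, recall = 8 / (8 + 1) ~= 0.889,
# f1 = 2 * 0.8 * 0.889 / (0.8 + 0.889) ~= 0.842, acc = (8 + 89) / 100 = 0.97
print(avg_metric.serialize())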

View File

@@ -11,7 +11,7 @@ import uuid
from overrides import overrides
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities, EvaluationMetrics, StatMetrics
from .shared.config import ConfigBuilder
config = ConfigBuilder.get_config()
@@ -161,7 +161,7 @@ class corenlp_NER(Pipeline):
}
@overrides
def fit(self, X, y, **params) -> dict:
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
default_params = self.__default_fit_params.copy()
#format input data into tsv file for ner to train on
try:
@@ -303,7 +303,7 @@ wordShape=""" + default_params["word_shape"] + """
@overrides
#TODO
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
return
## EXTRA METHODS TO HELP WITH THE corenlp PIPELINE ##
@@ -313,7 +313,7 @@ wordShape=""" + default_params["word_shape"] + """
#Takes input data and formats it to be easier to use in the corenlp pipeline
#ASSUMES DATA FOLLOWS FORMAT X = [string], y = [[(start offset, stop offset, label), ()], ... []]
#Currently cannot assign more than one label to the same word
def format_data(self, X, y):
def format_data(self, X: typing.Iterable[str], y):
out = []
for doc,ann in zip(X,y):
#Extract labeled entities from doc
@@ -352,7 +352,7 @@ wordShape=""" + default_params["word_shape"] + """
@overrides
#models must be saved with extension ".ser.gz"
def save_model(self, model_name):
def save_model(self, model_name: str):
if not model_name.endswith(".ser.gz"):
logger.warn('WARNING: model_name must end in .ser.gz, adding...')
model_name = model_name + ".ser.gz"
@@ -363,7 +363,7 @@ wordShape=""" + default_params["word_shape"] + """
@overrides
#properties can be exported/imported during train
def load_model(self, model_name):
def load_model(self, model_name: str):
#TODO: what to do if model doesn't exist?
if not model_name.endswith(".ser.gz"):
logger.warn('WARNING: model_name must end in .ser.gz, adding...')
@@ -390,31 +390,31 @@ wordShape=""" + default_params["word_shape"] + """
#Calculates Precision, Recall, and F1 Score for model based on input test data
#WARNING: currently works for BioNLP data, no guarantees with other datasets
def evaluate(self, X, y, Xid, verbose=False):
known_labels = set()
for anns in y:
for ann in anns:
known_labels.add(ann[2])
stats = {}
# WARNING: this is currently broken, but this whole pipeline is broken
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], verbose=False, **kwargs) -> EvaluationMetrics:
try:
train_data = self.format_data(X, y)
if len(train_data) == 0 or train_data is None:
raise Exception("ERROR: could not format input correctly")
except:
raise Exception("ERROR: could not format input correctly")
known_labels = set()
for anns in y:
for ann in anns:
known_labels.add(ann[2])
metrics = EvaluationMetrics()
test_text = ''
for doc in X:
test_text = test_text + doc + '\n\n'
#rest of code tries to recreate calculations as this line, which can't be called more than once for some reason
#results = self.__crf.classifyAndWriteAnswers(self.__java_String(self.__test_file), True)
#print(test_text)
results = self.__crf.classify(self.__java_String(test_text))
#Calculate evaluation by iterating through answer key and matching tokens to classifier output
s = 0
w = 0
@@ -474,7 +474,7 @@ wordShape=""" + default_params["word_shape"] + """
#(likely the current answer token doesn't exactly match the guess token, see `` vs '')
if i+1 < len(doc):
next_gold = doc[i+1]
elif i >= len(doc) and d+1 < len(test_data):
elif i >= len(doc) and d+1 < len(test_data): # this is broken
next_gold = test_data[d+1][0]
else:
next_gold = (None, None)
@@ -488,32 +488,30 @@ wordShape=""" + default_params["word_shape"] + """
known_labels.add(pred)
# Per token metriccs
# Per token metrics
for label in known_labels:
if label not in stats:
stats[label] = [0, 0, 0, 0]
if label not in metrics.labels:
metrics.labels[label] = StatMetrics()
if gold == pred and gold != 'O':
stats[gold][0] = stats[gold][0] + 1
metrics.labels[gold].tp += 1
for label in known_labels:
if label != gold:
stats[label][3] = stats[label][3] + 1
metrics.labels[label].tn += 1
elif gold == 'O' and pred != 'O':
stats[pred][1] = stats[pred][1] + 1
metrics.labels[pred].fp += 1
for label in known_labels:
if label != pred:
stats[label][3] = stats[label][3] + 1
metrics.labels[label].tn += 1
elif pred == 'O' and gold != 'O':
stats[gold][2] = stats[gold][2] + 1
metrics.labels[gold].fn += 1
for label in known_labels:
if label != gold:
stats[label][3] = stats[label][3] + 1
metrics.labels[label].tn += 1
else:
for label in known_labels:
stats[label][3] = stats[label][3] + 1
metrics.labels[label].tn += 1
# Per annotation metrics
@@ -555,54 +553,22 @@ wordShape=""" + default_params["word_shape"] + """
#ONLY USED FOR PER ANNOTATION METRICS
# del stats['O']
TP = 0
TN = 0
FP = 0
FN = 0
for key in stats:
TP = TP + stats[key][0]
FP = FP + stats[key][1]
FN = FN + stats[key][2]
TN = TN + stats[key][3]
stats['Totals'] = [TP, FP, FN, TN]
for key in metrics.labels:
metrics.totals.tp += metrics.labels[key].tp
metrics.totals.fp += metrics.labels[key].fp
metrics.totals.fn += metrics.labels[key].fn
metrics.totals.tn += metrics.labels[key].tn
#print(test_data[-1])
for key in stats:
TP = stats[key][0]
FP = stats[key][1]
FN = stats[key][2]
# Only generated when using per token metrics
TN = stats[key][3]
if (TP+FN) != 0:
recall = TP/(TP+FN)
else:
recall = 1.0
if (TP+FP) != 0:
precision = TP/(TP+FP)
else:
precision = 0.0
if (precision + recall) != 0:
f1 = 2 * (precision * recall) / (precision + recall)
else:
f1 = 0
# Acc Only works when using per token metrics which generates TN
if (TP + FN + FP + TN) != 0:
acc = (TP + TN) / (TP + FN + FP + TN)
else:
acc = 0
#Used for annotation metrics
# stats[key] = {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN}
# Used for token metrics
stats[key] = {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN, 'acc': acc}
metrics.calc_precision_recall_f1_acc()
return stats
return metrics
#Calculates Precision, Recall, and F1 Score for model based on input test data
#TODO: prints a whole lot to the command line, find a way to suppress?
def evaluate_orig(self, X, y, Xid):
def evaluate_orig(self, X: typing.Iterable[str], y, Xid):
try:
test_data = self.format_data(X, y)
if len(test_data) == 0 or test_data is None:

View File

@@ -13,7 +13,7 @@ import typing
import pydash
from overrides import overrides
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities, EvaluationMetrics, StatMetrics
from .shared.config import ConfigBuilder
config = ConfigBuilder.get_config()
@@ -148,14 +148,14 @@ class opennlp_NER(Pipeline):
}
@overrides
def fit(self, X, y, **params) -> dict:
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
try:
data = self.format_data(X, y)
if len(data)==0 or data is None:
raise Exception("ERROR: could not format input correctly")
except:
raise Exception("ERROR: could not format input correctly")
#print(data)
logger.debug("Formated train data: %s", data)
with open(self.__train_file, 'w') as f:
f.write(data)
inputStreamFactory = self.__java_MarkableFileInputStreamFactory(self.__java_File(self.__java_String(self.__train_file)))
@@ -249,14 +249,14 @@ class opennlp_NER(Pipeline):
@overrides
# TODO
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
return
# EXTRA METHODS TO HELP WITH THE opennlp PIPELINE ##
@overrides
# models must be saved and loaded with extension ".bin"
def save_model(self, model_name):
def save_model(self, model_name: str):
if not model_name.endswith(".bin"):
logger.warning('WARNING: model_name must end with .bin, adding...')
model_name = model_name + ".bin"
@@ -266,7 +266,7 @@ class opennlp_NER(Pipeline):
@overrides
def load_model(self, model_name):
def load_model(self, model_name: str):
if not model_name.endswith(".bin"):
logger.warning('WARNING: model_name must end with .bin, adding...')
model_name = model_name + ".bin"
@@ -313,7 +313,7 @@ class opennlp_NER(Pipeline):
#Takes input data and formats it to be easier to use in the opennlp pipeline
#ASSUMES DATA FOLLOWS FORMAT X = [string], y = [[(start offset, stop offset, label), ()], ... []]
#Currently cannot assign more than one label to the same word
def format_data(self, X, y):
def format_data(self, X: typing.Iterable[str], y):
out = ''
try:
for doc, ann in zip(X, y):
@@ -373,13 +373,14 @@ class opennlp_NER(Pipeline):
labels_per_token.append(labels)
return labels_per_token
def evaluate(self, X, y, Xid):
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
predictions = self.predict(X)
stats = {'Totals': [0, 0, 0, 0]}
metrics = EvaluationMetrics()
for (doc_id, prediction) in zip(Xid, predictions):
for (index, prediction) in enumerate(predictions):
guesses: typing.List[NerPrediction] = prediction.ner
gold = y[Xid.index(doc_id)]
gold = y[index]
all_tokens = prediction.extra_data
@@ -414,47 +415,26 @@ class opennlp_NER(Pipeline):
else:
TN.append(label)
for label in all_known_labels:
if label not in stats:
stats[label] = [0,0,0,0]
if label not in metrics.labels:
metrics.labels[label] = StatMetrics()
for label in TP:
stats[label][0] += 1
stats['Totals'][0] += 1
metrics.labels[label].tp += 1
metrics.totals.tp += 1
for label in FP:
stats[label][1] += 1
stats['Totals'][1] += 1
metrics.labels[label].fp += 1
metrics.totals.fp += 1
for label in FN:
stats[label][2] += 1
stats['Totals'][2] += 1
metrics.labels[label].fn += 1
metrics.totals.fn += 1
for label in TN:
stats[label][3] += 1
stats['Totals'][3] += 1
metrics.labels[label].tn += 1
metrics.totals.tn += 1
for key in stats:
TP = stats[key][0]
FP = stats[key][1]
FN = stats[key][2]
TN = stats[key][3]
if (TP + FN) != 0:
recall = TP / (TP + FN)
else:
recall = 1.0
if (TP + FP) != 0:
precision = TP / (TP + FP)
else:
precision = 0.0
if (precision + recall) != 0:
f1 = 2 * (precision * recall) / (precision + recall)
else:
f1 = 0
if (TP + FN + FP + TN) != 0:
acc = (TP + TN) / (TP + FN + FP + TN)
else:
acc = 0
stats[key] = {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN, "TN" : TN, "acc": acc}
metrics.calc_precision_recall_f1_acc()
return stats
return metrics
def evaluate_orig(self, X, y, Xid):
def evaluate_orig(self, X: typing.Iterable[str], y, Xid):
try:
data = self.format_data(X, y)
if len(data) == 0 or data is None:

View File

@@ -1,8 +1,66 @@
# (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
import abc
import logging
import typing
logger = logging.getLogger(__name__)
class StatMetrics(object):
def __init__(self, precision: float = None, recall: float = None, f1: float = None,
tp: int = 0, fp: int = 0, fn: int = 0, tn: int = 0, acc: float = None):
self.precision = precision
self.recall = recall
self.f1 = f1
self.tp = tp
self.fp = fp
self.fn = fn
self.tn = tn
self.acc = acc
def calc_precision_recall_f1_acc(self):
if (self.tp + self.fn) != 0:
self.recall = self.tp / (self.tp + self.fn)
else:
self.recall = 1.0
if (self.tp + self.fp) != 0:
self.precision = self.tp / (self.tp + self.fp)
else:
self.precision = 0.0
if (self.precision + self.recall) != 0:
self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
else:
self.f1 = 0.0
if (self.tp + self.fn + self.fp + self.tn) != 0:
self.acc = (self.tp + self.tn) / (self.tp + self.fn + self.fp + self.tn)
else:
self.acc = 0.0
def serialize(self) -> dict:
return {"precision": self.precision, "recall": self.recall, "f1": self.f1, "TP": self.tp,
"FP": self.fp, "FN": self.fn, "TN": self.tn, "acc": self.acc}
class EvaluationMetrics(object):
def __init__(self):
self.labels: typing.Dict[str, StatMetrics] = {}
self.totals = StatMetrics()
def calc_precision_recall_f1_acc(self):
for label in self.labels:
self.labels[label].calc_precision_recall_f1_acc()
self.totals.calc_precision_recall_f1_acc()
def serialize(self) -> dict:
d = {}
for key in self.labels:
d[key] = self.labels[key].serialize()
if key == "Totals":
logging.warn("There was a label called 'Totals' that is going to be overridden.")
d["Totals"] = self.totals.serialize()
return d
class NerPrediction(object):
def __init__(self, offset_start: int, offset_end: int, label: str):
self.offset_start: int = offset_start
@@ -64,9 +122,15 @@ class Pipeline(object, metaclass=abc.ABCMeta):
# fit(X, y)
# internal state is changed
@abc.abstractmethod
def fit(self, X, y, **params) -> dict:
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
raise NotImplementedError('Must define fit to use Pipeline Base Class')
# evaluate(X, y, all_labels)
# returns stats
@abc.abstractmethod
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
raise NotImplementedError('Must define evaluate to use Pipeline Base Class')
# predict(X)
# returns [[[offset_start, offset_end, label], ..., ...]
@abc.abstractmethod
@@ -84,15 +148,15 @@ class Pipeline(object, metaclass=abc.ABCMeta):
# Given model's current state evaluate the input (id, String) pairs and return a rank ordering of lowest->highest scores for instances (will need to discuss specifics of ranking)
# Discussing rank is now a major project - see notes
@abc.abstractmethod
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
raise NotImplementedError('Must define next_example to use Pipeline Base Class')
# saves model so that it can be loaded again later
@abc.abstractmethod
def save_model(self, model_name):
def save_model(self, model_name: str):
raise NotImplementedError('Must define save_model to use Pipeline Base Class')
# loads a previously saved model
@abc.abstractmethod
def load_model(self, model_name):
def load_model(self, model_name: str):
raise NotImplementedError('Must define load_model to use Pipeline Base Class')
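
A brief sketch of the new StatMetrics / EvaluationMetrics classes in use (counts are made up; the import path is assumed from the package layout):

from pine.pipelines.pipeline import EvaluationMetrics, StatMetrics  # assumed import path

metrics = EvaluationMetrics()
metrics.labels["geo"] = StatMetrics(tp=3, fp=1, fn=2, tn=14)
metrics.totals = StatMetrics(tp=3, fp=1, fn=2, tn=14)
metrics.calc_precision_recall_f1_acc()
print(metrics.serialize())
# -> {"geo": {"precision": 0.75, "recall": 0.6, "f1": 0.666..., "TP": 3, "FP": 1, "FN": 2, "TN": 14, "acc": 0.85},
#     "Totals": {...}}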

View File

@@ -5,7 +5,7 @@ import logging
import os
import typing
from .pipeline import Pipeline, DocumentPredictions, DocumentPredictionProbabilities
from .pipeline import Pipeline, DocumentPredictions, DocumentPredictionProbabilities, EvaluationMetrics
from overrides import overrides
@@ -20,7 +20,7 @@ class NER(Pipeline):
__lib = ''
pipeline = -1
__SUPPORTED_PIPELINES = ['spacy', 'corenlp', 'opennlp']
__SUPPORTED_PIPELINES = ['spacy', 'corenlp', 'opennlp', 'simpletransformers']
#initializes proper nlp library pipeline based on user selection
#there are additional args to accomodate initializing different pipelines, check individual pipeline for specifics
@@ -56,8 +56,8 @@ class NER(Pipeline):
#internal state is changed
#kwargs varies between pipelines, see individual pipeline for extra arguments
@overrides
def fit(self, X, y, **kwargs) -> dict:
return self.pipeline.fit(X, y, **kwargs)
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
return self.pipeline.fit(X, y, all_labels, **params)
@overrides
def predict(self, X: typing.Iterable[str]) -> typing.List[DocumentPredictions]:
@@ -68,20 +68,19 @@ class NER(Pipeline):
def predict_proba(self, X: typing.Iterable[str], **kwargs) -> typing.List[DocumentPredictionProbabilities]:
return self.pipeline.predict_proba(X, **kwargs)
# evaluate(X, y, Xid)
# returns stats
def evaluate(self, X, y, Xid, **kwargs):
return self.pipeline.evaluate(X, y, Xid, **kwargs)
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
return self.pipeline.evaluate(X, y, all_labels, **kwargs)
#next_example(Xid)
#Given model's current state evaluate the input (id, String) pairs and return a rank ordering of lowest->highest scores for instances (will need to discuss specifics of ranking)
@overrides
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
#may want to program it here instead of one level down, as the ranking function might not change with the pipeline used
return self.pipeline.next_example(X, Xid)
@overrides
def save_model(self, model_name):
def save_model(self, model_name: str):
directory = os.path.dirname(model_name)
# if directories in path don't exist, create them
if not os.path.exists(directory):
@@ -90,5 +89,5 @@ class NER(Pipeline):
return self.pipeline.save_model(model_name)
@overrides
def load_model(self, model_name):
def load_model(self, model_name: str):
self.pipeline.load_model(model_name)

View File

@@ -95,6 +95,15 @@ class BaseConfig(object):
framework="spacy",
types=["fit", "predict", "status"]
)
),
dict(
name="simpletransformers",
version="1.0",
channel="service_simpletransformers",
service=dict(
framework="simpletransformers",
types=["fit", "predict", "status"]
)
)
]

View File

@@ -0,0 +1,305 @@
#!/usr/bin/env python3
# coding: utf8
# (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
import logging
import os
import os.path
from shutil import copyfile
import uuid
import typing
from overrides import overrides
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities, EvaluationMetrics, StatMetrics
from .shared.config import ConfigBuilder
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer
import numpy as np
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
config = ConfigBuilder.get_config()
logger = logging.getLogger(__name__)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
# TODO: Change the collections.json file to default the collection for simple transformers with real classifiers, etc
class simpletransformers_NER(Pipeline):
def __init__(self, tmp_dir=None):
self.__id = uuid.uuid4()
if tmp_dir != None:
self.__temp_dir = tmp_dir
#can choose to dictate where the model will store files so that it doesn't overwrite any,
#otherwise it will write to a new directory within the resources folder
else:
self.__temp_dir = config.ROOT_DIR + '/tmp/simpletransformers-' + str(self.__id)
self.__model_dir = os.path.join(self.__temp_dir, "OUTPUT_MODEL/")
self.__default_model_args = {
# TODO: Some of these should be args passed in with defaults, probably epoch size, the dirs, any others Brant might say
# TODO: There is a runs/ directory that is created by default in the current directory where this is run (pipelines/),
# there might be an option to change that, or maybe just add to gitignore?
"output_dir": self.__model_dir,
"cache_dir": os.path.join(self.__temp_dir, "CACHE_DIR/"),
"tensorboard_dir": os.path.join(self.__temp_dir, "TENSORBOARD/"),
"max_seq_length": 128,
"train_batch_size": 16,
"gradient_accumulation_steps": 1,
"eval_batch_size": 8,
"num_train_epochs": 1,
"weight_decay": 0,
"learning_rate": 4e-5,
"adam_epsilon": 1e-8,
"warmup_ratio": 0.06,
"warmup_steps": 20,
"max_grad_norm": 1.0,
"logging_steps": 50,
"save_steps": 500,
"overwrite_output_dir": True,
"reprocess_input_data": False,
"evaluate_during_training": False,
}
# TODO: Switch back to Bio_ClinicalBERT, and also add this as a configurable option.
# All models we can use: https://huggingface.co/models
# self.__model_name = "emilyalsentzer/Bio_ClinicalBERT"
# This is currently being used because it is faster.
self.__model_type = "bert"
self.__model_name = "google/mobilebert-uncased"
self.__model_use_cuda = False
self.__model = None
self.__sentence_tokenizer = PunktSentenceTokenizer()
self.__word_tokenizer = WhitespaceTokenizer()
# status()
@overrides
def status(self) -> dict:
return {
"default_model_args": self.__default_model_args
}
# fit(X, y)
# internal state is changed
@overrides
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
# setting up params
model_args = self.__default_model_args.copy()
if params is not None:
for key in model_args.keys():
if key in params:
model_args[key] = params[key]
logger.info("Training with parameters: {}".format(model_args))
# First, need to set up the data into a pandas dataframe and format our labels
df = self._format_data(X, y)
labels = self._format_labels(all_labels)
# Create a new model, needs to be here for now since this is where we get labels
if not self.__model:
self.__model = NERModel(self.__model_type, self.__model_name, labels=labels,
use_cuda=self.__model_use_cuda, args=model_args)
# After this, the model should be trained, and output files created
self.__model.train_model(df, verbose=False, silent=True,
show_running_loss=False)
return {}
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
if not self.__model:
raise Exception("Can't evaluate until model has been trained or loaded")
# First, need to set up the data into a pandas dataframe and format our labels
df = self._format_data(X, y)
# No need to recreate model, as this is only run after fit().
# Evaluate.
result, model_outputs, preds_list = self.__model.eval_model(
df, verbose=False)
# acc=sklearn.metrics.accuracy_score
logger.info("Evaluated model, result={}".format(result))
metrics = EvaluationMetrics()
metrics.totals.precision = result["precision"]
metrics.totals.recall = result["recall"]
metrics.totals.f1 = result["f1_score"]
# TODO: need acc
# TODO: need metrics for each label
return metrics
# predict(X)
@overrides
def predict(self, X: typing.Iterable[str]) -> typing.List[DocumentPredictions]:
# First, make sure this model has been trained
if not self.__model:
return None
# Make predictions with the model
return_preds = []
for doc in X:
data = [s for s in self._sentencize(doc)]
predictions, _ = self.__model.predict([sentence for (_, _, sentence) in data])
return_preds.append(self._format_prediction(data, predictions))
return return_preds
# predict_proba(X)
# can also return scores for all labels if get_all is True
@overrides
def predict_proba(self, X: typing.Iterable[str], **kwargs) -> typing.List[DocumentPredictionProbabilities]:
# TODO: Need to implement this.
# The "raw_outputs" (second item in tuple returned from predict) is probably useful for this.
# Can turn predictions into probabilities for each label by running:
# Where the array passed in refers to each word (print raw_outputs in the expanded_ner.py file to see this)
# a = np.asarray([-0.2597193, 0.3929489, 0.42044127, 0.65579444, -0.075302914, 0.0072728638, 0.11236907, -0.035289638, -0.09346388, -0.25901815, -0.16599336, -0.06283752, -0.2664347])
# prob = softmax(a)
# prob is then equal to: array([0.0552652 , 0.10614558, 0.10910426, 0.13805568, 0.06645731,
# 0.07217802, 0.0801766 , 0.0691704 , 0.06526127, 0.05530396,
# 0.06069549, 0.06729091, 0.05489531]) which look like the probabilities of the labels (there are the same number of elements as labels)
# It probably refers to the order of the labels given in, so if the labels arg was ['B-geo', 'I-geo'...] then
# B-geo is probably 0.0552652 and I-geo is probably 0.10614558... etc
return []
# next_example(X, Xid)
# Given model's current state evaluate the input (id, String) pairs and return a rank ordering of lowest->highest scores for instances (will need to discuss specifics of ranking)
# Discussing rank is now a major project - see notes
@overrides
def next_example(self, X: typing.Iterable[str], Xid):
# Don't think we needed to do anything with this.
return None
# saves model so that it can be loaded again later
@overrides
def save_model(self, model_name: str):
# Save all files from the output dir to the desired spot in order to load
os.mkdir(model_name)
# Copy from the tmp directory - but not the checkpoints
for filename in os.listdir(self.__model_dir):
if "checkpoint" not in filename:
copyfile(os.path.join(self.__model_dir, filename), os.path.join(model_name, filename))
return model_name
# loads a previously saved model
@overrides
def load_model(self, model_name: str):
# Loading from model requires creating the model from the saved directory
# This "model_name" is just the path, it doesn't refer to the name like before
self.__model = NERModel(self.__model_type, model_name,
use_cuda=self.__model_use_cuda, args=self.__default_model_args)
###############################
# Helper Methods
###############################
def _get_word_label(self, start_index, end_index, label_list):
# Takes in the indices of a word and label list to return a related tag (if possible)
# This will account for the I-<label> or B-<label> that simpletransformers expects
for label_group in label_list:
# This works because the word either begins a multi-word label or the label only covers a single word
if label_group[0] == start_index:
return "B-" + label_group[2]
# This is at least the second word in a multi-word label
# <= because == works on the last word, > is for any word that appears BETWEEN the first and last words
elif label_group[0] < start_index and label_group[1] >= end_index:
return "I-" + label_group[2]
# Assuming y is always sorted, this ends the loop if there is no label at this index early to save time
elif end_index < label_group[0]:
break
# If it got here, the label was not found
return "O"
def _sentencize(self, text: str) -> typing.Generator[typing.Tuple[int, int, str], None, None]:
for (sentence_start, sentence_end) in self.__sentence_tokenizer.span_tokenize(text):
yield (sentence_start, sentence_end, text[sentence_start:sentence_end])
# Takes input data and formats it to be easier to use in the simpletransformers pipeline
# ASSUMES DATA FOLLOWS FORMAT X = [string], y = [[(start offset, stop offset, label), ()], ... []]
# Simpletransformers needs a pandas dataframe with columns: sentence_id, words, labels
def _format_data(self, X: typing.Iterable[str], y) -> pd.DataFrame:
# TODO: Need to check to make sure no sentence has over max_seq_length words
df = pd.DataFrame(columns=["sentence_id","words","labels"])
curr_sentence_id = 0
for (doc_txt, labels) in zip(X, y):
for (sentence_start, _, sentence) in self._sentencize(doc_txt):
for (sentence_word_start, sentence_word_end) in self.__word_tokenizer.span_tokenize(sentence):
word_start = sentence_start + sentence_word_start
word_end = sentence_start + sentence_word_end
word = doc_txt[word_start:word_end]
curr_label = self._get_word_label(word_start, word_end, labels)
df = df.append({
"sentence_id": curr_sentence_id,
"words": word,
"labels": curr_label
}, ignore_index=True)
curr_sentence_id += 1
return df
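# For illustration (made-up text): a document "John lives in Paris." annotated with
# (0, 4, "per") would produce rows roughly like:
#   sentence_id  words   labels
#   0            John    B-per
#   0            lives   O
#   0            in      O
#   0            Paris.  O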
# Takes the prediction output of simpletransformers ([[{'U.N.': 'B-per'}], [{'relief': 'I-gpe'}], ...])
# and turns it into the form PINE desires, [[[offset_start, offset_end, label], ..., ...]
def _format_prediction(self, data, predictions) -> DocumentPredictions:
ner: typing.List[NerPrediction] = []
for (index, sentence_predictions) in enumerate(predictions):
sentence_start, _, sentence = data[index]
current_label = None
current_label_start = None
current_label_end = None
word_index = 0
sentence_ner: typing.List[NerPrediction] = []
for pred_dict in sentence_predictions:
for (word, label) in pred_dict.items():
word_index = sentence.find(word, word_index)
if label == "O":
if current_label != None:
sentence_ner.append(NerPrediction(current_label_start, current_label_end, current_label))
current_label = current_label_start = current_label_end = None
continue
is_b = label.startswith("B-")
is_i = label.startswith("I-")
if is_b or is_i:
label = label[2:]
# if we're at the beginning, we always add the old tag
# if we're at an inner and it's different from the current label, add the old tag
if current_label != None and (is_b or (is_i and label != current_label)):
sentence_ner.append(NerPrediction(current_label_start, current_label_end, current_label))
current_label = current_label_start = current_label_end = None
if current_label != None: # continuing the label
current_label_end = sentence_start + word_index + len(word)
else: # new label
current_label = label
current_label_start = sentence_start + word_index
current_label_end = sentence_start + word_index + len(word)
# the last label
if current_label != None:
sentence_ner.append(NerPrediction(current_label_start, current_label_end, current_label))
ner += sentence_ner
return DocumentPredictions(ner, [])
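# For example (illustrative): predictions [[{'U.N.': 'B-org'}, {'official': 'O'}]] over the
# single sentence "U.N. official Ekeus heads for Baghdad." would yield roughly
# [NerPrediction(0, 4, 'org')]; consecutive B-/I- tags with the same label merge into one span.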
# Get a list of all labels in a set of data
def _format_labels(self, all_labels: typing.List[str]):
# Have to add a B-<label> and I-<label> for each label.
ret_labels = []
for label in all_labels:
ret_labels.append("B-" + str(label))
ret_labels.append("I-" + str(label))
# Add the other tag
ret_labels.append("O")
return ret_labels
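
As a companion to the predict_proba TODO above, a minimal sketch (assuming numpy and scipy are available) of turning one word's raw model outputs into per-label probabilities with a softmax; the values and label ordering are illustrative:

import numpy as np
from scipy.special import softmax

# One word's row of raw_outputs from NERModel.predict (values are made up).
raw = np.asarray([-0.26, 0.39, 0.42, 0.66, -0.08, 0.01, 0.11, -0.04, -0.09, -0.26, -0.17, -0.06, -0.27])
probs = softmax(raw)  # sums to 1.0; one entry per label
# probs[i] would line up with the i-th entry of the labels list passed to NERModel,
# e.g. the ["B-geo", "I-geo", ..., "O"] list produced by _format_labels().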

View File

@@ -19,7 +19,7 @@ from spacy.scorer import Scorer
from spacy.gold import GoldParse
from overrides import overrides
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities
from .pipeline import Pipeline, NerPrediction, DocumentPredictions, NerPredictionProbabilities, DocumentPredictionProbabilities, EvaluationMetrics, StatMetrics
logger = logging.getLogger(__name__)
@@ -74,7 +74,7 @@ class spacy_NER(Pipeline):
}
@overrides
def fit(self, X, y, **params) -> dict:
def fit(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **params) -> dict:
#setting up params
default_params = self.__default_fit_params.copy()
if params is not None:
@@ -118,25 +118,24 @@ class spacy_NER(Pipeline):
"losses": all_losses
}
def evaluate(self, X, y, Xid):
@overrides
def evaluate(self, X: typing.Iterable[str], y, all_labels: typing.Iterable[str], **kwargs) -> EvaluationMetrics:
train_data = self.format_data(X, y)
all_labels = set()
metrics = dict()
metrics = EvaluationMetrics()
# get all labels
for text, annot in train_data:
for ent in annot['entities']:
all_labels.add(ent[2])
all_labels = list(all_labels)
stats = {}
for text, annots in train_data:
pred_doc = self.__nlp(text)
gold_doc = self.__nlp.make_doc(text)
gold_labels = []
stats['Totals'] = [0,0,0,0]
for label in all_labels:
stats[label] = [0,0,0,0]
metrics.labels[label] = StatMetrics()
for token in pred_doc:
gold_labels.append(set())
@@ -149,11 +148,9 @@ class spacy_NER(Pipeline):
goldParse = GoldParse(gold_doc, entities=annotations_for_label)
for index, annotation in enumerate(goldParse.ner):
if annotation != 'O':
gold_labels[index].add(annotation[2:])
for index, pred_token in enumerate(pred_doc):
pred_label = pred_token.ent_type_
if pred_label != '':
@@ -161,56 +158,33 @@ class spacy_NER(Pipeline):
if label == pred_label:
if label in gold_labels[index]:
#TP
stats[label][0] += 1
stats['Totals'][0] += 1
metrics.labels[label].tp += 1
metrics.totals.tp += 1
else:
#FP
stats[label][1] += 1
stats['Totals'][1] += 1
metrics.labels[label].fp += 1
metrics.totals.fp += 1
else:
#All other labels are true negative because the model can only predict one label per token
#TN
stats[label][3] += 1
stats['Totals'][3] += 1
metrics.labels[label].tn += 1
metrics.totals.tn += 1
else:
for label in all_labels:
if label in gold_labels[index]:
#FN
stats[label][2] += 1
stats['Totals'][2] += 1
metrics.labels[label].fn += 1
metrics.totals.fn += 1
else:
#TN
stats[label][3] += 1
stats['Totals'][3] += 1
metrics.labels[label].tn += 1
metrics.totals.tn += 1
for key in stats:
TP = stats[key][0]
FP = stats[key][1]
FN = stats[key][2]
TN = stats[key][3]
if (TP + FN) != 0:
recall = TP / (TP + FN)
else:
recall = 1.0
if (TP + FP) != 0:
precision = TP / (TP + FP)
else:
precision = 0.0
if (precision + recall) != 0:
f1 = 2 * (precision * recall) / (precision + recall)
else:
f1 = 0
if (TP + FN + FP + TN) != 0:
acc = (TP + TN) / (TP + FN + FP + TN)
else:
acc = 0
metrics[key] = {'precision': precision, 'recall': recall, 'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN, "TN": TN,
"acc": acc}
metrics.calc_precision_recall_f1_acc()
return metrics
@@ -356,14 +330,14 @@ class spacy_NER(Pipeline):
@overrides
# TODO
def next_example(self, X, Xid):
def next_example(self, X: typing.Iterable[str], Xid):
return
## EXTRA METHODS TO HELP WITH THE SPACY PIPELINE ##
# Takes input data and formats it to be easier to use in the spacy pipeline
# ASSUMES DATA FOLLOWS FORMAT X = [string], y = [[(start offset, stop offset, label), ()], ... []]
def format_data(self, X, y):
def format_data(self, X: typing.Iterable[str], y):
out = []
for i, text in enumerate(X):
out.append((text, {'entities': [(labels) for labels in y[i]]}))
@@ -374,11 +348,11 @@ class spacy_NER(Pipeline):
self.__ner.add_label(entity)
@overrides
def save_model(self, model_name):
def save_model(self, model_name: str):
self.__nlp.to_disk(model_name)
logger.info('Saved model to ' + model_name)
return model_name
@overrides
def load_model(self, model_name):
def load_model(self, model_name: str):
self._load_model(model_path=model_name)

View File

@@ -161,6 +161,9 @@ def main():
if not os.path.isfile(redis_start):
lock_print("Couldn't find redis start script: {}.".format(redis_start))
return 1
pipeline_dir = os.path.join(DIR, "pipelines")
pipeline_start = os.path.join(pipeline_dir, "dev_run.sh")
backend_dir = os.path.join(DIR, "backend")
backend_start = os.path.join(backend_dir, "dev_run.sh")
@@ -179,24 +182,21 @@ def main():
if docker:
frontend_annotation_start = [frontend_annotation_start, "--", "--host", "0.0.0.0"]
pipeline_dir = os.path.join(DIR, "pipelines")
pipeline_start = os.path.join(pipeline_dir, "dev_run.sh")
eve_process = start_eve_process(eve_dir, eve_start)
if not eve_only:
redis_process = start_redis_process(redis_dir, redis_start)
pipeline_process = start_pipeline(pipeline_dir, pipeline_start)
backend_process = start_backend_process(backend_dir, backend_start)
if not eve_only and not backend_only:
frontend_annotation_process = start_frontend_annotation_process(frontend_annotation_dir, frontend_annotation_start)
pipeline_process = start_pipeline(pipeline_dir, pipeline_start)
def signal_handler(sig, frame):
lock_print("")
if not eve_only and not backend_only:
stop_pipeline(pipeline_process)
stop_frontend_annotation_process(frontend_annotation_process)
if not eve_only:
stop_backend_process(backend_process)
stop_pipeline(pipeline_process)
stop_redis_process(redis_process)
stop_eve_process(eve_process)
lock_print("")

View File

@@ -39,7 +39,7 @@
"labels": ["geo", "gpe", "per", "org", "tim", "art"],
"metadata": {
"title": "Small Collection",
"description": "This is a small collection"
"description": "This is a small collection using spaCy pipeline"
},
"archived": false,
"configuration": {
@@ -156,5 +156,35 @@
"text_column": 0
}
}
},
{
"collection": {
"creator_id": "ada",
"annotators": ["ada"],
"viewers": ["ada"],
"labels": ["geo", "gpe", "per", "org", "tim", "art"],
"metadata": {
"title": "Small Collection Simpletransformers",
"description": "This is a small collection using Simpletransformers pipeline"
},
"archived": false,
"configuration": {
"allow_overlapping_ner_annotations": false
}
}, "classifier": {
"pipelineId": "5babb6ee4eb7dd2c39b96720",
"overlap": 0,
"train_every": 5,
"classifierParameters": {
"cutoff": 1,
"iterations": 5
}
}, "documents": {
"ner_annotations": {
"csv_file": "./ner_dataset.csv",
"sentences_per_doc": 5
},
"num_docs": 5
}
}
]

View File

@@ -40,5 +40,15 @@
"use_type_seqs2": [true, false],
"use_type_y_sequences": [true, false]
}
},
{
"_id": "5babb6ee4eb7dd2c39b96720",
"title": "SimpleTransformers - Bio-ClinicalBERT",
"description": "SimpleTransformers models.",
"name": "simpletransformers",
"parameters": {
"training_batch_size": "integer",
"num_train_epochs": "integer"
}
}
]

View File

@@ -239,7 +239,18 @@ def test_train_and_predict_opennlp():
[972, 976, 'gpe'], [1025, 1029, 'gpe'], [1089, 1096, 'geo'], [1113, 1120, 'gpe'],
[1200, 1209, 'tim'], [1221, 1225, 'org']]
def test_sync_train():
def test_train_and_predict_simpletransformers():
prediction = _test_train_and_predict("Small Collection Simpletransformers")
assert len(prediction["doc"]) == 0
preds = prediction["ner"]
# unfortunately the simpletransformers predictions are not the same across runs
# and there don't seem to be guaranteed common tokens
# so just make sure any predictions have proper labels...
common_labels = {'gpe', 'org', 'geo', 'tim', 'per'}
for pred in preds:
assert pred[2] in common_labels
def test_sync_train():
client = common.login_with_test_user(common.client())
collection = common.get_collection(client, "Small Collection OpenNLP")