# -*- coding: utf-8 -*-
# (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.

"""
    eve-demo-client
    ~~~~~~~~~~~~~~~

    Simple and quickly hacked together, this script is used to reset the
    eve-demo API to its initial state. It uses standard API calls to:

    1) delete all items in the 'people' and 'works' collections
    2) post multiple items in both collections

    It can also serve as a basic example of how to programmatically manage
    a remote API using the phenomenal Requests library by Kenneth Reitz
    (a very basic 'get' function is included even if not used).

    :copyright: (c) 2015 by Nicola Iarocci.
    :license: BSD, see LICENSE for more details.
"""

import sys
import json
import requests
import csv
import pprint
import time
import random
import os

from sklearn.datasets import fetch_20newsgroups
from random import randrange
from pymongo import MongoClient

if os.environ.get("FLASK_PORT"):
    FLASK_PORT = int(os.environ.get("FLASK_PORT"))
else:
    FLASK_PORT = 5000

if os.environ.get("MONGO_PORT"):
    MONGO_PORT = int(os.environ.get("MONGO_PORT"))
else:
    MONGO_PORT = 27017

ENTRY_POINT = '127.0.0.1:{}'.format(FLASK_PORT)
OVERLAP = .15

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
data = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
#data.data, data.target
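
# Create a single demo collection owned, annotated, and viewable by `userid`.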
def create_collection(userid, labels):
    collection = [{
        'creator_id': userid,
        'annotators': [userid],
        'viewers': [userid],
        'labels': labels,
        'metadata': {'title': 'Trial Collection', 'description': 'This is a sample description of a collection'},
        'archived': False,
        'configuration': {
            'allow_overlapping_ner_annotations': True
        }
    }]
    r = perform_post('collections', json.dumps(collection))
    return get_ids(r)
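
# Post num_docs documents drawn from the (unshuffled) 20 Newsgroups test split.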
def create_documents(collection_id, user_id, num_docs):
    data = fetch_20newsgroups(subset='test', categories=categories, shuffle=False)

    docs = []
    for i in range(num_docs):
        docs.append({
            'creator_id': user_id,
            'collection_id': collection_id,
            'overlap': 0,
            'text': data.data[i]
        })
    r = perform_post('documents', json.dumps(docs))
    print('Created:', len(docs), 'documents')
    return get_ids(r)
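
# Create one document-level annotation per document id, labeled with the
# newsgroup category at the same index in the (unshuffled) dataset.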
def create_annotations(user_id, collection_id, doc_ids, categories):
    data = fetch_20newsgroups(subset='test', categories=categories, shuffle=False)

    annotations = []
    for i, doc_id in enumerate(doc_ids):
        annotations.append({
            'creator_id': user_id,
            'collection_id': collection_id,
            'document_id': doc_id,
            'annotation': [categories[data.target[i]]]
        })
    r = perform_post('annotations', json.dumps(annotations))
    print('Created:', len(annotations), 'annotations')
    return get_ids(r)
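
# Re-label annotations via PATCH. Eve enforces optimistic concurrency control,
# so each PATCH must echo the record's current _etag in an If-Match header;
# we GET each annotation first to read its _etag.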
def update_annotations(annotation_ids):
    #perform_get('annotations?where={"document_id": {"$in": ["5b3531a9aec9104c8a9aca9e", "5b3531a9aec9104c8a9acaa0"]}}'
    for ann_id in annotation_ids:
        url = 'http://' + ENTRY_POINT + '/annotations/' + ann_id
        response = requests.get(url, headers={'Content-Type': 'application/json'})
        if response.status_code == 200:
            r = response.json()
            etag = r['_etag']
            headers = {'Content-Type': 'application/json', 'If-Match': etag}
            data = {'annotation': ['soc.religion.christian']}
            requests.patch(url, json.dumps(data), headers=headers)
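
# Register the three available NER pipelines (OpenNLP, spaCy, CoreNLP) with
# fixed _id values and their tunable parameters.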
def create_pipeline():
    pipeline = [
        {
            "_id": "5babb6ee4eb7dd2c39b9671c",
            "title": "Apache OpenNLP Named Entity Recognition",
            "description": "Apache's open source natural language processing toolkit for named entity recognition (NER). See https://opennlp.apache.org/ for more information. This is the default pipeline used for NER.",
            "name": "opennlp",
            "parameters": {
                "cutoff": "integer",
                "iterations": "integer"
            }
        },
        {
            "_id": "5babb6ee4eb7dd2c39b9671d",
            "title": "spaCy Named Entity Recognition",
            "description": "spaCy is a free open-source library for Natural Language Processing in Python. It features NER, POS tagging, dependency parsing, word vectors and more.",
            "name": "spaCy",
            "parameters": {
                "n_iter": "integer",
                "dropout": "float"
            }
        },
        {
            "_id": "5babb6ee4eb7dd2c39b9671f",
            "title": "Stanford CoreNLP Named Entity Recognition",
            "description": "Stanford's natural language processing toolkit for named entity recognition (NER). See https://stanfordnlp.github.io/CoreNLP/ for more information.",
            "name": "corenlp",
            "parameters": {
                "max_left": "integer",
                "use_class_feature": [True, False],
                "use_word": [True, False],
                "use_ngrams": [True, False],
                "no_mid_ngrams": [True, False],
                "max_ngram_length": "integer",
                "use_prev": [True, False],
                "use_next": [True, False],
                "use_disjunctive": [True, False],
                "use_sequences": [True, False],
                "use_prev_sequences": [True, False],
                "use_type_seqs": [True, False],
                "use_type_seqs2": [True, False],
                "use_type_y_sequences": [True, False]
            }
        }
    ]
    r = perform_post('pipelines', json.dumps(pipeline))
    return get_ids(r)
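
# Seed a handful of demo user and administrator accounts.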
def create_user():
    users = []
    users.append({
        '_id': 'bchee1',
        'firstname': 'Brant',
        'lastname': 'Chee',
        'email': 'bchee1@jhmi.edu',
        'description': 'Brant Developer',
        'role': ['user']
    })
    users.append({
        '_id': 'bchee2',
        'firstname': 'Brant',
        'lastname': 'Chee',
        'email': 'bchee2@jhmi.edu',
        'description': 'Brant administrator',
        'role': ['administrator']
    })
    users.append({
        '_id': 'lglende1',
        'firstname': 'Laura',
        'lastname': 'Glendenning',
        'email': 'lglende1@jh.edu',
        'description': 'Developer Laura',
        'role': ['user']
    })
    users.append({
        '_id': 'cahn9',
        'firstname': 'Charles',
        'lastname': 'Ahn',
        'email': 'cahn9@jh.edu',
        'description': 'Developer Charles',
        'role': ['user']
    })
    r = perform_post('users', json.dumps(users))
    return get_ids(r)
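
# Create a classifier tied to a collection and pipeline; the docstring below
# sketches the schema the endpoint expects.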
def create_classifier(collection_id, overlap, pipeline_id, labels):
    '''{
    'collection_id': {'type': 'objectid', 'required': True},
    'overlap': {'type': 'float', 'required': True},
    'pipeline_id': {'type': 'objectid', 'required': True},
    'parameters': {'type': 'dict'}'''
    classifier_obj = {'collection_id': collection_id,
                      'overlap': overlap,
                      'pipeline_id': pipeline_id,
                      'parameters': {"cutoff": 1, "iterations": 100},
                      'labels': labels
                      }
    r = perform_post('classifiers', json.dumps(classifier_obj))
    return get_ids(r)
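
# Create an initially empty metrics record for a classifier.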
def create_metrics(collection_id, classifier_id):
    # create metrics for classifier
    metrics_obj = {"collection_id": collection_id,
                   "classifier_id": classifier_id,
                   "documents": list(),
                   "annotations": list()
                   }
    metrics_resp = perform_post("metrics", json.dumps(metrics_obj))
    return get_ids(metrics_resp)
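
# Build the classifier's next-instance queue. The first `overlap` fraction of
# the documents is assigned to every annotator (overlap_document_ids); the
# remainder form the shared document_ids queue.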
def create_next_ids(classifier_id, ann_ids, docids, overlap):
    num_overlap = int(len(docids) * overlap)
    # we're lazy and take the first n docs as the overlap set
    #'classifier_id': {'type': 'objectid', 'required': True},
    #'document_ids': {'type': 'list', 'required': True},
    #'overlap_document_ids': {'type': 'dict', 'required': True}
    overlap_obj = {'classifier_id': classifier_id, 'document_ids': docids[num_overlap:], 'overlap_document_ids': {}}
    for ann_id in ann_ids:
        overlap_obj['overlap_document_ids'][ann_id] = docids[0:num_overlap]
    r = perform_post('next_instances', json.dumps(overlap_obj))
    return get_ids(r)
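
# Extract the '_id' values from an Eve POST response. Eve returns 201 on
# success; bulk inserts nest the created items under '_items', e.g.
#   {'_status': 'OK', '_items': [{'_status': 'OK', '_id': '...'}, ...]}
# while a single insert returns '_id' at the top level.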
def get_ids(response):
    valids = []
    #print("Response:", response)
    if response.status_code == 201:
        r = response.json()
        if r['_status'] == 'OK':
            if '_items' in r:
                for obj in r['_items']:
                    if obj['_status'] == "OK":
                        valids.append(obj['_id'])
            else:
                valids.append(r['_id'])
    return valids
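
# Thin wrappers around requests. The target host defaults to ENTRY_POINT but
# can be overridden by passing host:port as the first command-line argument.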
def perform_get(resource, data):
    headers = {'Content-Type': 'application/json'}
    return requests.get(endpoint(resource), data, headers=headers)


def perform_post(resource, data):
    headers = {'Content-Type': 'application/json'}
    return requests.post(endpoint(resource), data, headers=headers)


def endpoint(resource):
    url = 'http://%s/%s/' % (
        ENTRY_POINT if not sys.argv[1:] else sys.argv[1], resource)
    return url
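
# Drop a MongoDB database directly so the demo starts from a clean slate.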
def delete_database(mongourl, database):
    client = MongoClient(mongourl)
    client.drop_database(database)
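
# End-to-end NER demo: load BIO-tagged sentences from a CSV, then create a
# collection, documents (with sample images), a classifier, metrics, a
# next-instance queue, and span annotations.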
def create_bionlp_annotations(bionlpfile, num_docs, pipeline_id, creator_id, annotator_ids):
    docs, anns, stats = load_bionlp(bionlpfile, num_docs)
    categories = list(stats.keys())

    # Create collection
    collection = [{
        'creator_id': creator_id,
        'annotators': annotator_ids,
        'viewers': annotator_ids,
        'labels': categories,
        'metadata': {'title': 'NER Test Collection', 'description': 'This is a sample collection to test NER tasks'},
        'archived': False,
        'configuration': {
            'allow_overlapping_ner_annotations': True
        }
    }]
    r = perform_post('collections', json.dumps(collection))
    collection_id = get_ids(r)
    collection_id = collection_id[0]
    print("collection_id", collection_id)

    # Create documents
    images = [
        'https://upload.wikimedia.org/wikipedia/commons/thumb/0/08/Unequalized_Hawkes_Bay_NZ.jpg/600px-Unequalized_Hawkes_Bay_NZ.jpg',
        'https://cdn.indreams.me/cdf00b6d4827cd66511bdc35e1ef2ea3_10',
        '/static/apl.png',
        '/static/campus.jpg'
    ]
    upload = []
    for i in range(len(docs)):
        upload.append({
            'creator_id': creator_id,
            'collection_id': collection_id,
            'overlap': 0,
            'text': docs[i],
            'metadata': {'imageUrl': images[randrange(0, len(images))]}
        })
    r = perform_post('documents', json.dumps(upload))
    print('Created:', len(upload), 'documents')
    doc_ids = get_ids(r)

    classifier_ids = create_classifier(collection_id, 0, pipeline_id, categories)
    print('Classifier id', classifier_ids)
    metrics_ids = create_metrics(collection_id, classifier_ids[0])
    print('Metrics id', metrics_ids)
    next_ids = create_next_ids(classifier_ids[0], annotator_ids, doc_ids, 0)
    annotations = []

    for i, doc_id in enumerate(doc_ids):
        annotations.append({
            'creator_id': annotator_ids[random.randrange(0, len(annotator_ids))],
            'collection_id': collection_id,
            'document_id': doc_id,
            'annotation': anns[i]
        })
    r = perform_post('annotations', json.dumps(annotations))
    print('Created:', len(annotations), 'annotations')
    return get_ids(r)
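
# Parse a BIO-tagged CSV (sentence id in column 0, token in column 1, tag in
# column 3) into documents of sentences_per_doc sentences each; returns
# (doc texts, character-span annotations, per-label counts).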
def load_bionlp(csvname, limit):
    docs = []
    anns = []
    stats = {}
    sentences_per_doc = 10  # total of 47959 sentences
    with open(csvname, 'r', encoding='utf-8', errors='ignore') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        doc_text = ''
        doc_anns = []
        sentence_id = 1
        next(csv_reader)  # skip the header row
        for line in csv_reader:
            if line[0] != '':
                sentence_id = int(line[0].split(':')[1])
                # once you have enough sentences per doc, append to the lists and
                # clear doc_text/doc_anns; the >= check avoids triggering on the
                # first sentence
                if sentence_id % sentences_per_doc == 1 and sentence_id >= sentences_per_doc:
                    docs.append(doc_text)
                    doc_text = ''
                    anns.append(doc_anns)
                    doc_anns = []
                    if len(docs) > limit - 2:
                        break
            token = line[1]
            # add token to text and record start/end char
            start_char = len(doc_text)
            doc_text += token
            end_char = len(doc_text)
            doc_text += ' '

            if line[3] != 'O':  # if label is not 'O'
                label = line[3].split('-')[1]  # strip the BIO prefix, e.g. the 'B' in 'B-tag'
                if label not in stats:
                    stats[label] = 0
                if line[3].split('-')[0] == 'B':
                    # only count 'begin' tags, otherwise labels spanning multiple
                    # tokens are counted multiple times
                    stats[label] += 1
                    doc_anns.append((start_char, end_char, label))  # add label to annotations
                elif line[3].split('-')[0] == 'I':
                    # NOTE: assumes I-tags only ever follow B-tags, will break if not the case
                    # if the label spans multiple tokens, replace the most recent
                    # annotation with one extended to the new end char
                    doc_anns.append((doc_anns[-1][0], end_char, label))
                    del doc_anns[-2]
            line_count += 1
    # add the remaining partial document
    docs.append(doc_text)
    anns.append(doc_anns)
    return docs, anns, stats
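
# Reset the Mongo databases and repopulate the API end to end. Usage sketch
# (script name shown generically; assumes the Eve-based backend is running):
#   python <this_script>.py              # talk to ENTRY_POINT
#   python <this_script>.py host:port    # override the API host (see endpoint())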
if __name__ == '__main__':
    mongourl = 'mongodb://localhost:{}'.format(MONGO_PORT)
    delete_database(mongourl, 'test')  # old database
    delete_database(mongourl, 'pmap_nlp')

    # generate new data
    user_ids = create_user()
    pipeline_ids = create_pipeline()
    collection_id = create_collection(user_ids[1], categories)
    doc_ids = create_documents(collection_id[0], user_ids[1], 750)
    classifier_ids = create_classifier(collection_id[0], OVERLAP, pipeline_ids[0], categories)
    # note: create_metrics takes (collection_id, classifier_id), in that order
    metrics_ids = create_metrics(collection_id[0], classifier_ids[0])
    next_ids = create_next_ids(classifier_ids[0], [user_ids[1]], doc_ids, OVERLAP)
    annotation_ids = create_annotations(user_ids[0], collection_id[0], doc_ids[int(len(doc_ids)/2):], categories)
    print("collection_id=", collection_id[0])
    print("classifier_id='", classifier_ids[0], "'")
    print("metrics_id='", metrics_ids[0], "'")
    #update_annotations(annotation_ids[int(len(annotation_ids)/2):])

    collection_id2 = create_collection(user_ids[1], categories)
    doc_ids2 = create_documents(collection_id2[0], user_ids[1], 500)
    # annotate the documents in the second collection (collection_id2)
    annotation_ids2 = create_annotations(user_ids[0], collection_id2[0], doc_ids2, categories)
    update_annotations(annotation_ids2[0:int(len(annotation_ids2) / 2)])

    print('user_ids=', user_ids)
    print('pipeline_ids=', pipeline_ids)
    print('collection_id=', collection_id)
    print('doc_ids=', doc_ids)
    print('classifier_ids=', classifier_ids)
    print('next_ids=', next_ids)
    print('annotation_ids1=', annotation_ids)
    print('annotation_ids2=', annotation_ids2)
    print('collection_id2=', collection_id2)
    print('doc_ids2=', doc_ids2)

    ner_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ner_dataset.csv')

    #user_ids = create_user()
    #pipeline_ids = create_pipeline()
    #print('user_ids', user_ids)
    #print('pipeline_ids', pipeline_ids)
    create_bionlp_annotations(ner_file, 150, pipeline_ids[1], user_ids[0], user_ids)