# -*- coding: utf-8 -*-
# (C) 2019 The Johns Hopkins University Applied Physics Laboratory LLC.
"""
eve-demo-client
~~~~~~~~~~~~~~~
Simple and quickly hacked together, this script is used to reset the
eve-demo API to its initial state. It will use standard API calls to:
1) delete all items in the 'people' and 'works' collections
2) post multiple items in both collection
I guess it can also serve as a basic example of how to programmatically
manage a remote API using the phenomenal Requests library by Kenneth Reitz
(a very basic 'get' function is included even if not used).
:copyright: (c) 2015 by Nicola Iarocci.
:license: BSD, see LICENSE for more details.
"""
import sys
import json
import requests
import csv
import pprint
import time
import random
import os
from sklearn.datasets import fetch_20newsgroups
from random import randrange
from pymongo import MongoClient
if os.environ.get("FLASK_PORT"):
    FLASK_PORT = int(os.environ.get("FLASK_PORT"))
else:
    FLASK_PORT = 5000

if os.environ.get("MONGO_PORT"):
    MONGO_PORT = int(os.environ.get("MONGO_PORT"))
else:
    MONGO_PORT = 27017
ENTRY_POINT = '127.0.0.1:{}'.format(FLASK_PORT)
OVERLAP = .15
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
data = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
#data.data, data.target
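
# Creates one collection owned by `userid` (also its only annotator and viewer)
# with the given label set; returns the new collection id(s).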
def create_collection(userid, labels):
    collection = [{
        'creator_id': userid,
        'annotators': [userid],
        'viewers': [userid],
        'labels': labels,
        'metadata': {'title': 'Trial Collection', 'description': 'This is a sample description of a collection'},
        'archived': False,
        'configuration': {
            'allow_overlapping_ner_annotations': True
        }
    }]
    r = perform_post('collections', json.dumps(collection))
    return get_ids(r)
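
# Posts `num_docs` documents into the given collection, using 20 Newsgroups
# test articles as the document text.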
def create_documents(collection_id, user_id, num_docs):
    data = fetch_20newsgroups(subset='test', categories=categories, shuffle=False)
    docs = []
    for i in range(num_docs):
        docs.append({
            'creator_id': user_id,
            'collection_id': collection_id,
            'overlap': 0,
            'text': data.data[i]
        })
    r = perform_post('documents', json.dumps(docs))
    print('Created:', len(docs), 'documents')
    return get_ids(r)
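
# Posts one document-level annotation per document id, labelling each document
# with its 20 Newsgroups category name.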
def create_annotations(user_id, collection_id, doc_ids, categories):
    data = fetch_20newsgroups(subset='test', categories=categories, shuffle=False)
    annotations = []
    for i, doc_id in enumerate(doc_ids):
        annotations.append({
            'creator_id': user_id,
            'collection_id': collection_id,
            'document_id': doc_id,
            # data.target values index into data.target_names (sorted), not the caller-supplied order
            'annotation': [data.target_names[data.target[i]]]
        })
    r = perform_post('annotations', json.dumps(annotations))
    print('Created:', len(annotations), 'annotations')
    return get_ids(r)
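
# PATCHes each annotation to the label 'soc.religion.christian', fetching the
# current _etag first so it can be sent in the If-Match header Eve expects.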
def update_annotations(annotation_ids):
    #perform_get('annotations?where={"document_id": {"$in": ["5b3531a9aec9104c8a9aca9e", "5b3531a9aec9104c8a9acaa0"]}}'
    for id in annotation_ids:
        url = 'http://' + ENTRY_POINT + '/annotations/' + id
        response = requests.get(url, headers={'Content-Type': 'application/json'})
        if response.status_code == 200:
            r = response.json()
            etag = r['_etag']
            headers = {'Content-Type': 'application/json', 'If-Match': etag}
            data = {'annotation': ['soc.religion.christian']}
            requests.patch('http://' + ENTRY_POINT + '/annotations/' + id, json.dumps(data), headers=headers)
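
# Registers the three NER pipelines (OpenNLP, spaCy, CoreNLP) with fixed ids
# and their tunable parameter schemas.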
def create_pipeline():
    pipeline = [
        {
            "_id": "5babb6ee4eb7dd2c39b9671c",
            "title": "Apache OpenNLP Named Entity Recognition",
            "description": "Apache's open source natural language processing toolkit for named entity recognition (NER). See https://opennlp.apache.org/ for more information. This is the default pipeline used for NER.",
            "name": "opennlp",
            "parameters": {
                "cutoff": "integer",
                "iterations": "integer"
            }
        },
        {
            "_id": "5babb6ee4eb7dd2c39b9671d",
            "title": "SpaCy Named Entity Recognition",
            "description": "spaCy is a free open-source library for Natural Language Processing in Python. It features NER, POS tagging, dependency parsing, word vectors and more",
            "name": "spaCy",
            "parameters": {
                "n_iter": "integer",
                "dropout": "float"
            }
        },
        {
            "_id": "5babb6ee4eb7dd2c39b9671f",
            "title": "Stanford CoreNLP Named Entity Recognition",
            "description": "Stanford's natural language processing toolkit for named entity recognition (NER). See https://stanfordnlp.github.io/CoreNLP/ for more information.",
            "name": "corenlp",
            "parameters": {
                "max_left": "integer",
                "use_class_feature": [True, False],
                "use_word": [True, False],
                "use_ngrams": [True, False],
                "no_mid_ngrams": [True, False],
                "max_ngram_length": "integer",
                "use_prev": [True, False],
                "use_next": [True, False],
                "use_disjunctive": [True, False],
                "use_sequences": [True, False],
                "use_prev_sequences": [True, False],
                "use_type_seqs": [True, False],
                "use_type_seqs2": [True, False],
                "use_type_y_sequences": [True, False]
            }
        }
    ]
    r = perform_post('pipelines', json.dumps(pipeline))
    return get_ids(r)
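
# Creates four test users: three with the 'user' role and one administrator.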
def create_user():
    users = []
    users.append({
        '_id': 'bchee1',
        'firstname': 'Brant',
        'lastname': 'Chee',
        'email': 'bchee1@jhmi.edu',
        'description': 'Brant Developer',
        'role': ['user']
    })
    users.append({
        '_id': 'bchee2',
        'firstname': 'Brant',
        'lastname': 'Chee',
        'email': 'bchee2@jhmi.edu',
        'description': 'Brant administrator',
        'role': ['administrator']
    })
    users.append({
        '_id': 'lglende1',
        'firstname': 'Laura',
        'lastname': 'Glendenning',
        'email': 'lglende1@jh.edu',
        'description': 'Developer Laura',
        'role': ['user']
    })
    users.append({
        '_id': 'cahn9',
        'firstname': 'Charles',
        'lastname': 'Ahn',
        'email': 'cahn9@jh.edu',
        'description': 'Developer Charles',
        'role': ['user']
    })
    r = perform_post('users', json.dumps(users))
    return get_ids(r)
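
# Creates a classifier tied to a collection and pipeline; the schema expected
# by the API is sketched in the docstring below.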
def create_classifier(collection_id, overlap, pipeline_id, labels):
    '''{
    'collection_id': {'type': 'objectid', 'required': True},
    'overlap': {'type': 'float', 'required': True},
    'pipeline_id': {'type': 'objectid', 'required': True},
    'parameters': {'type': 'dict'}'''
    classifier_obj = {
        'collection_id': collection_id,
        'overlap': overlap,
        'pipeline_id': pipeline_id,
        'parameters': {"cutoff": 1, "iterations": 100},
        'labels': labels
    }
    r = perform_post('classifiers', json.dumps(classifier_obj))
    return get_ids(r)
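
# Creates an (initially empty) metrics record for a collection/classifier pair.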
def create_metrics(collection_id, classifier_id):
    # create metrics for classifier
    metrics_obj = {
        "collection_id": collection_id,
        "classifier_id": classifier_id,
        "documents": list(),
        "annotations": list()
    }
    metrics_resp = perform_post("metrics", json.dumps(metrics_obj))
    return get_ids(metrics_resp)
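
# Seeds the next_instances resource: the first `overlap` fraction of documents
# is assigned to every annotator, and the remainder go into the shared queue.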
def create_next_ids(classifier_id, ann_ids, docids, overlap):
    num_overlap = int(len(docids) * overlap)
    # we're lazy and taking the first n docs as overlap
    # 'classifier_id': {'type': 'objectid', 'required': True},
    # 'document_ids': {'type': 'list', 'required': True},
    # 'overlap_document_ids': {'type': 'dict', 'required': True}
    overlap_obj = {'classifier_id': classifier_id, 'document_ids': docids[num_overlap:], 'overlap_document_ids': {}}
    for id in ann_ids:
        overlap_obj['overlap_document_ids'][id] = docids[0:num_overlap]
    r = perform_post('next_instances', json.dumps(overlap_obj))
    return get_ids(r)
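
# Extracts the created object ids from an Eve POST response, handling both
# single-item and bulk (_items) responses.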
def get_ids(response):
    valids = []
    #print("Response:", response)
    if response.status_code == 201:
        r = response.json()
        if r['_status'] == 'OK':
            if '_items' in r:
                for obj in r['_items']:
                    if obj['_status'] == "OK":
                        valids.append(obj['_id'])
            else:
                valids.append(r['_id'])
    return valids
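
# Thin HTTP helpers: build the API URL (an optional first CLI argument
# overrides the default host:port) and issue JSON GET/POST requests.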
def perform_get(resource, data):
    headers = {'Content-Type': 'application/json'}
    return requests.get(endpoint(resource), params=data, headers=headers)

def perform_post(resource, data):
    headers = {'Content-Type': 'application/json'}
    return requests.post(endpoint(resource), data, headers=headers)

def endpoint(resource):
    url = 'http://%s/%s/' % (
        ENTRY_POINT if not sys.argv[1:] else sys.argv[1], resource)
    return url
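
# Drops an entire MongoDB database directly through pymongo (not via the API).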
def delete_database(mongourl, database):
    client = MongoClient(mongourl)
    client.drop_database(database)
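
# Builds an NER test collection from the NER CSV: creates the collection,
# documents (with random placeholder images), a classifier, metrics,
# next_instances, and span annotations derived from the CSV's B/I/O tags.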
def create_bionlp_annotations(bionlpfile, num_docs, pipeline_id, creator_id, annotator_ids):
    docs, anns, stats = load_bionlp(bionlpfile, num_docs)
    categories = list(stats.keys())
    # Create collection
    collection = [{
        'creator_id': creator_id,
        'annotators': annotator_ids,
        'viewers': annotator_ids,
        'labels': categories,
        'metadata': {'title': 'NER Test Collection', 'description': 'This is a sample collection to test NER tasks'},
        'archived': False,
        'configuration': {
            'allow_overlapping_ner_annotations': True
        }
    }]
    r = perform_post('collections', json.dumps(collection))
    collection_id = get_ids(r)
    collection_id = collection_id[0]
    print("collection_id", collection_id)
    # Create documents
    images = [
        'https://upload.wikimedia.org/wikipedia/commons/thumb/0/08/Unequalized_Hawkes_Bay_NZ.jpg/600px-Unequalized_Hawkes_Bay_NZ.jpg',
        'https://cdn.indreams.me/cdf00b6d4827cd66511bdc35e1ef2ea3_10',
        '/static/apl.png',
        '/static/campus.jpg'
    ]
    upload = []
    for i in range(len(docs)):
        upload.append({
            'creator_id': creator_id,
            'collection_id': collection_id,
            'overlap': 0,
            'text': docs[i],
            'metadata': {'imageUrl': images[randrange(0, len(images))]}
        })
    r = perform_post('documents', json.dumps(upload))
    print('Created:', len(upload), 'documents')
    doc_ids = get_ids(r)
    classifier_ids = create_classifier(collection_id, 0, pipeline_id, categories)
    print('Classifier id', classifier_ids)
    metrics_ids = create_metrics(collection_id, classifier_ids[0])
    print('Metrics id', metrics_ids)
    next_ids = create_next_ids(classifier_ids[0], annotator_ids, doc_ids, 0)
    annotations = []
    for i, doc_id in enumerate(doc_ids):
        annotations.append({
            'creator_id': annotator_ids[random.randrange(0, len(annotator_ids))],
            'collection_id': collection_id,
            'document_id': doc_id,
            'annotation': anns[i]
        })
    r = perform_post('annotations', json.dumps(annotations))
    print('Created:', len(annotations), 'annotations')
    return get_ids(r)
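
# Parses the NER CSV (one token per row, sentences grouped roughly 10 per
# document) into document texts, (start_char, end_char, label) span
# annotations, and per-label entity counts.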
def load_bionlp(csvname, limit):
    docs = []
    anns = []
    stats = {}
    sentences_per_doc = 10  # total of 47959 sentences
    with open(csvname, 'r', encoding='utf-8', errors='ignore') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        doc_text = ''
        doc_anns = []
        sentence_id = 1
        next(csv_reader)  # skip the header row
        for line in csv_reader:
            if line[0] != '':
                sentence_id = int(line[0].split(':')[1])
                # avoids triggering on the first sentence
                if sentence_id % sentences_per_doc == 1 and sentence_id >= sentences_per_doc:
                    # once you have enough sentences per doc, append to list and clear doc_text/doc_anns
                    # print('Added case ' + cases[-1])
                    docs.append(doc_text)
                    doc_text = ''
                    anns.append(doc_anns)
                    doc_anns = []
                    if len(docs) > limit - 2:
                        break
            token = line[1]
            # add token to text and record start/end char
            start_char = len(doc_text)
            doc_text += token
            end_char = len(doc_text)
            doc_text += ' '
            if line[3] != 'O':  # if label is not 'O'
                label = line[3].split('-')[1]  # has BILUO tags that we don't need, ex. 'B-tag'
                if label not in stats:
                    stats[label] = 0
                if line[3].split('-')[0] == 'B':
                    # only count if the label has the 'begin' tag, otherwise labels spanning multiple tokens are counted multiple times
                    stats[label] += 1
                    doc_anns.append((start_char, end_char, label))  # add label to annotations
                elif line[3].split('-')[0] == 'I':
                    # NOTE: assumes I-tags only ever follow B-tags, will break if not the case
                    # if the label spans multiple tokens, update the most recent annotation with the new end char
                    doc_anns.append((doc_anns[-1][0], end_char, label))
                    del doc_anns[-2]
            line_count += 1
        # add the remaining sentences
        docs.append(doc_text)
        doc_text = ''
        anns.append(doc_anns)
        doc_anns = []
    return docs, anns, stats
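
# Reset the databases, then rebuild two 20 Newsgroups document-classification
# collections and one NER collection from ner_dataset.csv.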
if __name__ == '__main__':
    mongourl = 'mongodb://localhost:{}'.format(MONGO_PORT)
    delete_database(mongourl, 'test')  # old database
    delete_database(mongourl, 'pmap_nlp')
    # generate new data
    user_ids = create_user()
    pipeline_ids = create_pipeline()
    collection_id = create_collection(user_ids[1], categories)
    doc_ids = create_documents(collection_id[0], user_ids[1], 750)
    classifier_ids = create_classifier(collection_id[0], OVERLAP, pipeline_ids[0], categories)
    # create_metrics expects (collection_id, classifier_id), in that order
    metrics_ids = create_metrics(collection_id[0], classifier_ids[0])
    next_ids = create_next_ids(classifier_ids[0], [user_ids[1]], doc_ids, OVERLAP)
    annotation_ids = create_annotations(user_ids[0], collection_id[0], doc_ids[int(len(doc_ids) / 2):], categories)
    print("collection_id=", collection_id[0])
    print("classifier_id='", classifier_ids[0], "'")
    print("metrics_id='", metrics_ids[0], "'")
    #update_annotations(annotation_ids[int(len(annotation_ids)/2):])
    collection_id2 = create_collection(user_ids[1], categories)
    doc_ids2 = create_documents(collection_id2[0], user_ids[1], 500)
    # annotate the second collection's documents against that collection
    annotation_ids2 = create_annotations(user_ids[0], collection_id2[0], doc_ids2, categories)
    update_annotations(annotation_ids2[0:int(len(annotation_ids2) / 2)])
    print('user_ids=', user_ids)
    print('pipeline_ids=', pipeline_ids)
    print('collection_id=', collection_id)
    print('doc_ids=', doc_ids)
    print('classifier_ids=', classifier_ids)
    print('next_ids=', next_ids)
    print('annotation_ids1=', annotation_ids)
    print('annotation_ids2=', annotation_ids2)
    print('collection_id2=', collection_id2)
    print('doc_ids2=', doc_ids2)
    ner_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ner_dataset.csv')
    #user_ids = create_user()
    #pipeline_ids = create_pipeline()
    #print('user_ids', user_ids)
    #print('pipeline_ids', pipeline_ids)
    create_bionlp_annotations(ner_file, 150, pipeline_ids[1], user_ids[0], user_ids)