Source code for processors.annotators

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# use data structures
from __future__ import unicode_literals
from processors.ds import Document, Sentence, DirectedGraph
from processors.utils import post_json
import json


class Processor(object):
    """
    Base Processor for text annotation (tokenization, sentence splitting,
    parsing, lemmatization, PoS tagging, named entity recognition, chunking, etc.).

    Parameters
    ----------
    address : str
        The base address for the API (i.e., everything preceding `/api/..`)


    Attributes
    ----------
    service : str
        The API endpoint for `annotate` requests.

    Methods
    -------
    annotate(text)
        Produces an annotated `Document` from the provided text.
    annotate_from_sentences(sentences)
        Produces an annotated `Document` from a [str] of text already split into sentences.

    """
    def __init__(self, address):
        self.service = "{}/api/annotate".format(address)

    def _message_to_json_dict(self, msg):
        """
        POST the JSON form of `msg` (`msg.to_JSON()`) to `self.service`
        and return whatever `post_json` returns.
        """
        return post_json(self.service, msg.to_JSON())

    def _annotate_message(self, msg):
        """
        Send `msg` to the `annotate` endpoint and build a `Document`
        from the response via `Document.load_from_JSON`.
        """
        # Reuse the single request helper rather than repeating the POST here.
        annotated_text = self._message_to_json_dict(msg)
        return Document.load_from_JSON(annotated_text)

    def _log_failure(self, action):
        """
        Record the exception currently being handled at DEBUG level.

        Must be called from inside an `except` block so that
        `exc_info=True` picks up the active exception.
        """
        # Function-scope import keeps this class independent of the
        # module's import block.
        import logging
        logging.getLogger(__name__).debug(
            "%s failed for service %s",
            action,
            getattr(self, "service", None),
            exc_info=True
        )

    def annotate(self, text):
        """
        Annotate text (tokenization, sentence splitting,
        parsing, lemmatization, PoS tagging, named entity recognition, chunking, etc.)

        Parameters
        ----------
        text : str
            `text` to be annotated.

        Returns
        -------
        processors.ds.Document or None
            An annotated Document composed of `sentences`.
            `None` is returned when any step of the request fails.
        """
        try:
            # load json and build Sentences and Document
            msg = Message(text)
            return self._annotate_message(msg)

        except Exception:
            # Best-effort contract: callers receive None on failure, but the
            # cause is still recorded instead of being dropped silently.
            self._log_failure("annotate")
            return None

    def annotate_from_sentences(self, sentences):
        """
        Annotate text that has already been segmented into `sentences`.

        Parameters
        ----------
        sentences : [str]
            A list of str representing text already split into sentences.

        Returns
        -------
        processors.ds.Document or None
            An annotated `Document` composed of `sentences`.
            `None` is returned when any step of the request fails.
        """
        try:
            # load json from str iterable and build Sentences and Document
            msg = SegmentedMessage(sentences)
            return self._annotate_message(msg)

        except Exception:
            # Same best-effort contract as `annotate`.
            self._log_failure("annotate_from_sentences")
            return None

class CluProcessor(Processor):
    """
    Processor for text annotation based on [`org.clulab.processors.clu.CluProcessor`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala)

    Uses the Malt parser.
    """
    def __init__(self, address):
        # Point `service` at the clu-specific `annotate` endpoint.
        endpoint_template = "{}/api/clu/annotate"
        self.service = endpoint_template.format(address)

    def annotate(self, text):
        """
        Annotate `text` by delegating to the parent class's `annotate`.
        """
        parent = super(CluProcessor, self)
        return parent.annotate(text)
class FastNLPProcessor(Processor):
    """
    Processor for text annotation based on [`org.clulab.processors.fastnlp.FastNLPProcessor`](https://github.com/clulab/processors/blob/master/corenlp/src/main/scala/org/clulab/processors/fastnlp/FastNLPProcessor.scala)

    Uses the Stanford CoreNLP neural network parser.
    """
    def __init__(self, address):
        base = address
        self.address = base
        # Endpoint used for `annotate` requests.
        self.service = "{}/api/fastnlp/annotate".format(base)
        # Endpoint used by the chunking helpers below.
        self.chunk_address = "{}/api/fastnlp/chunk".format(base)

    def annotate(self, text):
        """
        Annotate `text` by delegating to the parent class's `annotate`.
        """
        parent = super(FastNLPProcessor, self)
        return parent.annotate(text)

    def _chunk(self, obj):
        """
        POST `obj.to_JSON()` to `self.chunk_address` and return the
        result of `post_json`.
        """
        payload = obj.to_JSON()
        return post_json(self.chunk_address, payload)

    def chunk_sentence(self, sentence):
        """
        Send `sentence` to the chunk endpoint and load the response
        with `Sentence.load_from_JSON`.
        """
        return Sentence.load_from_JSON(self._chunk(sentence))

    def chunk_document(self, doc):
        """
        Send `doc` to the chunk endpoint and load the response
        with `Document.load_from_JSON`.
        """
        return Document.load_from_JSON(self._chunk(doc))
class BioNLPProcessor(Processor):
    """
    Processor for biomedical text annotation based on [`org.clulab.processors.fastnlp.FastNLPProcessor`](https://github.com/clulab/processors/blob/master/corenlp/src/main/scala/org/clulab/processors/fastnlp/FastNLPProcessor.scala)

    CoreNLP-derived annotator.
    """
    # NOTE(review): the docstring link targets FastNLPProcessor; confirm
    # whether a BioNLP-specific class was meant to be referenced.
    def __init__(self, address):
        # Point `service` at the bionlp-specific `annotate` endpoint.
        endpoint_template = "{}/api/bionlp/annotate"
        self.service = endpoint_template.format(address)

    def annotate(self, text):
        """
        Annotate `text` by delegating to the parent class's `annotate`.
        """
        parent = super(BioNLPProcessor, self)
        return parent.annotate(text)
class Message(object):
    """
    A storage class for passing `text` to API `annotate` endpoint.

    Attributes
    ----------
    text : str
        The `text` to be annotated.

    Methods
    -------
    to_JSON()
        Produces a json str in the structure expected by the API `annotate` endpoint.
    """
    def __init__(self, text):
        self.text = text

    def to_JSON_dict(self):
        """Return a new dict holding the text under the `text` key."""
        return {"text": self.text}

    def to_JSON(self):
        """Serialize `to_JSON_dict()` as a key-sorted, 4-space-indented json str."""
        payload = self.to_JSON_dict()
        return json.dumps(payload, indent=4, sort_keys=True)


class SegmentedMessage(object):
    """
    A storage class for passing text already split into sentences to API `annotate` endpoint.

    Attributes
    ----------
    segments : [str]
        Text to be annotated that has already been split into sentences.
        This segmentation is preserved during annotation.

    Methods
    -------
    to_JSON()
        Produces a json str in the structure expected by the API `annotate` endpoint.
    """
    def __init__(self, segments):
        self.segments = segments

    def to_JSON_dict(self):
        """Return a new dict holding the sentences under the `segments` key."""
        return {"segments": self.segments}

    def to_JSON(self):
        """Serialize `to_JSON_dict()` as a key-sorted, 4-space-indented json str."""
        payload = self.to_JSON_dict()
        return json.dumps(payload, indent=4, sort_keys=True)