# Source code for processors.odin

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from .utils import post_json
from .ds import Document, Interval, NLPDatum
from termcolor import colored
import re
import json


class OdinHighlighter(object):
    """
    Terminal (ANSI color) formatting helpers for rendering Odin mentions.
    Each static method wraps a token string in `termcolor` escape codes.
    """

    @staticmethod
    def LABEL(token):
        # mention label: bold red text
        return colored(token, color="red", attrs=["bold"])

    @staticmethod
    def ARG(token):
        # argument span: bold on a green background
        return colored(token, on_color="on_green", attrs=["bold"])

    @staticmethod
    def TRIGGER(token):
        # trigger span: bold on a blue background
        return colored(token, on_color="on_blue", attrs=["bold"])

    @staticmethod
    def CONCEAL(token):
        # concealed token on a grey background
        return colored(token, on_color="on_grey", attrs=["concealed"])

    @staticmethod
    def MENTION(token):
        # full mention span: yellow background
        return colored(token, on_color="on_yellow")

    @staticmethod
    def highlight_mention(mention):
        """
        Formats text of mention
        """
        tokens = list(mention.sentenceObj.words)

        def paint(formatter, start, end):
            # Re-color tokens in the half-open range [start, end) in place.
            for idx in range(start, end):
                tokens[idx] = formatter(tokens[idx])

        # format TBM span like an arg
        if mention.type == "TextBoundMention":
            paint(OdinHighlighter.ARG, mention.start, mention.end)
        # highlight each argument span (role names are not displayed)
        if mention.arguments:
            for args in mention.arguments.values():
                for arg in args:
                    paint(OdinHighlighter.ARG, arg.start, arg.end)
        # format trigger distinctly from args
        if mention.trigger:
            paint(OdinHighlighter.TRIGGER, mention.trigger.start, mention.trigger.end)

        # highlight tokens contained in the mention span
        paint(OdinHighlighter.MENTION, mention.start, mention.end)
        # join with a highlighted space so the background is continuous
        highlighted = OdinHighlighter.MENTION(" ").join(tokens[mention.start:mention.end])
        before = " ".join(tokens[:mention.start])
        after = " ".join(tokens[mention.end:])
        return "{} {} {}".format(before, highlighted, after).strip()

class Mention(NLPDatum):
    """
    A labeled span of text.  Used to model textual mentions of events,
    relations, and entities.

    Parameters
    ----------
    token_interval : Interval
        The span of the Mention represented as an Interval.
    sentence : int
        The sentence index that contains the Mention.
    document : Document
        The Document in which the Mention was found.
    foundBy : str
        The Odin IE rule that produced this Mention.
    label : str
        The label most closely associated with this span.  Usually the lowest
        hyponym of "labels".
    labels : list
        The list of labels associated with this span.
    trigger : dict or None
        dict of JSON for Mention's trigger (event predicate or word(s)
        signaling the Mention).
    arguments : dict or None
        dict of JSON for Mention's arguments.
    paths : dict or None
        dict of JSON encoding the syntactic paths linking a Mention's
        arguments to its trigger (applies to Mentions produced from
        `type:"dependency"` rules).
    keep : bool
        Whether or not the Mention should be retained in serialized output.
    doc_id : str or None
        the id of the document

    Attributes
    ----------
    tokenInterval : processors.ds.Interval
        An `Interval` encoding the `start` and `end` of the `Mention`.
    start : int
        The token index that starts the `Mention`.
    end : int
        The token index that marks the end of the Mention (exclusive).
    sentenceObj : processors.ds.Sentence
        Pointer to the `Sentence` instance containing the `Mention`.
    characterStartOffset : int
        The index of the character that starts the `Mention`.
    characterEndOffset : int
        The index of the character that ends the `Mention`.
    type : Mention.TBM or Mention.EM or Mention.RM
        The type of the `Mention`.

    See Also
    --------
    [`Odin` manual](https://arxiv.org/abs/1509.07513)
    """

    TBM = "TextBoundMention"
    EM = "EventMention"
    RM = "RelationMention"

    def __init__(self,
                 token_interval,
                 sentence,
                 document,
                 foundBy,
                 label,
                 labels=None,
                 trigger=None,
                 arguments=None,
                 paths=None,
                 keep=True,
                 doc_id=None):
        NLPDatum.__init__(self)
        self.label = label
        self.labels = labels if labels else [self.label]
        self.tokenInterval = token_interval
        self.start = self.tokenInterval.start
        self.end = self.tokenInterval.end
        self.document = document
        self._doc_id = doc_id or hash(self.document)
        self.sentence = sentence
        if trigger:
            # NOTE: doc id is not stored for the trigger's json,
            # as it is assumed to be contained in the same document as its parent
            trigger.update({"document": self._doc_id})
            self.trigger = Mention.load_from_JSON(trigger, self._to_document_map())
        else:
            self.trigger = None
        # unpack args: each role maps to a list of Mention JSON dicts
        if arguments:
            self.arguments = {
                role: [Mention.load_from_JSON(a, self._to_document_map()) for a in args]
                for (role, args) in arguments.items()
            }
        else:
            self.arguments = None
        self.paths = paths
        self.keep = keep
        self.foundBy = foundBy
        # convenience pointers derived from the document
        self.sentenceObj = self.document.sentences[self.sentence]
        self.text = " ".join(self.sentenceObj.words[self.start:self.end])
        # recover character offsets from the sentence's token offsets
        self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start]
        self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1]
        # for later recovery (assigned when loading from JSON)
        self.id = None
        self.type = self._set_type()

    def __str__(self):
        return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self))

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # NOTE: relies on NLPDatum.to_JSON producing a hashable (string) form
        return hash(self.to_JSON())

    def to_JSON_dict(self):
        """
        Serialize this Mention to a JSON-compatible dict.
        """
        m = dict()
        m["id"] = self.id
        m["type"] = self.type
        m["label"] = self.label
        m["labels"] = self.labels
        m["tokenInterval"] = self.tokenInterval.to_JSON_dict()
        m["characterStartOffset"] = self.characterStartOffset
        m["characterEndOffset"] = self.characterEndOffset
        m["sentence"] = self.sentence
        m["document"] = self._doc_id
        # do we have a trigger?
        if self.trigger:
            m["trigger"] = self.trigger.to_JSON_dict()
        # do we have arguments?
        if self.arguments:
            m["arguments"] = self._arguments_to_JSON_dict()
        # handle paths
        if self.paths:
            m["paths"] = self.paths
        m["keep"] = self.keep
        m["foundBy"] = self.foundBy
        return m

    def startOffset(self):
        """Character offset where this Mention's span begins."""
        # FIX: originally indexed endOffsets, returning the *end* of the first
        # token rather than its start (cf. endOffset below).
        return self.sentenceObj.startOffsets[self.start]

    def endOffset(self):
        """Character offset where this Mention's span ends."""
        return self.sentenceObj.endOffsets[self.end - 1]

    def words(self):
        """Words for this Mention's span."""
        return self.sentenceObj.words[self.start:self.end]

    def tags(self):
        """Part of speech for this Mention's span."""
        return self.sentenceObj.tags[self.start:self.end]

    def lemmas(self):
        """Lemmas for this Mention's span."""
        return self.sentenceObj.lemmas[self.start:self.end]

    def _chunks(self):
        """Chunk labels for this Mention's span."""
        return self.sentenceObj._chunks[self.start:self.end]

    def _entities(self):
        """NE labels for this Mention's span."""
        return self.sentenceObj._entities[self.start:self.end]

    def copy(self, **kwargs):
        """
        Copy constructor for this Mention.  Any keyword argument overrides
        the corresponding attribute of the copy.
        """
        # return new instance
        return self.__class__(
            label=kwargs.get("label", self.label),
            # FIX: originally looked up the wrong key ("label"), so a caller
            # passing labels=... was silently ignored.
            labels=kwargs.get("labels", self.labels),
            token_interval=kwargs.get("token_interval", self.tokenInterval),
            sentence=kwargs.get("sentence", self.sentence),  # NOTE: this is the sentence idx
            document=kwargs.get("document", self.document),
            foundBy=kwargs.get("foundBy", self.foundBy),
            trigger=kwargs.get("trigger", self.trigger),
            arguments=kwargs.get("arguments", self.arguments),
            paths=kwargs.get("paths", self.paths),
            keep=kwargs.get("keep", self.keep),
            doc_id=kwargs.get("doc_id", self._doc_id)
        )

    def overlaps(self, other):
        """
        Test whether `other` (a token index or Mention) overlaps
        with the span of this Mention.
        """
        if isinstance(other, int):
            return self.start <= other < self.end
        elif isinstance(other, Mention):
            # equiv. sentences + checks on start and end
            return (hash(self.sentence) == hash(other.sentence)) and \
                    self.tokenInterval.overlaps(other.tokenInterval)
        else:
            return False

    def matches(self, label_pattern):
        """
        Test if the provided pattern, `label_pattern`, matches
        any element in `Mention.labels`.

        Parameters
        ----------
        label_pattern : str or _sre.SRE_Pattern
            The pattern to match against each element in `Mention.labels`

        Returns
        -------
        bool
            True if `label_pattern` matches any element in `Mention.labels`
        """
        return any(re.match(label_pattern, label) for label in self.labels)

    def _arguments_to_JSON_dict(self):
        return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items())

    def _paths_to_JSON_dict(self):
        # FIX: originally iterated self.paths without .items(), which would
        # raise when unpacking (role, paths) from the dict's keys.
        return {role: paths.to_JSON_dict() for (role, paths) in self.paths.items()}

    @staticmethod
    def load_from_JSON(mjson, docs_dict):
        """
        Deserialize a Mention from its JSON dict.

        Parameters
        ----------
        mjson : dict
            JSON dict of a Mention (as produced by to_JSON_dict).
        docs_dict : dict
            Mapping of doc id -> Document used to resolve mjson["document"].
        """
        # recover document
        doc_id = mjson["document"]
        doc = docs_dict[doc_id]
        labels = mjson["labels"]
        kwargs = {
            "label": mjson.get("label", labels[0]),
            "labels": labels,
            "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]),
            "sentence": mjson["sentence"],
            "document": doc,
            "doc_id": doc_id,
            "trigger": mjson.get("trigger", None),
            "arguments": mjson.get("arguments", None),
            "paths": mjson.get("paths", None),
            "keep": mjson.get("keep", True),
            "foundBy": mjson["foundBy"]
        }
        m = Mention(**kwargs)
        # set IDs
        m.id = mjson["id"]
        m._doc_id = doc_id
        # restore character offsets
        # FIX: originally assigned snake_case character_start_offset /
        # character_end_offset, leaving the camelCase attributes used by
        # to_JSON_dict untouched and creating dead duplicates.
        m.characterStartOffset = mjson["characterStartOffset"]
        m.characterEndOffset = mjson["characterEndOffset"]
        return m

    def _to_document_map(self):
        # minimal doc-id -> Document mapping for nested load_from_JSON calls
        return {self._doc_id: self.document}

    def _set_type(self):
        """Classify this Mention as an event, textbound, or relation mention."""
        # event mention: has a trigger
        if self.trigger is not None:
            return Mention.EM
        # textbound mention: neither trigger nor arguments
        elif self.arguments is None:
            return Mention.TBM
        # relation mention: arguments but no trigger
        else:
            return Mention.RM