# Source code for processors.odin

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from .utils import post_json
from .ds import Document, Interval, NLPDatum
from termcolor import colored
import re
import json


class OdinHighlighter(object):
    """
    Terminal (ANSI color) formatting helpers for rendering Odin mentions.
    Each static method wraps a token string in `termcolor` escape codes.
    """

    @staticmethod
    def LABEL(token):
        # mention label: bold red text
        return colored(token, color="red", attrs=["bold"])

    @staticmethod
    def ARG(token):
        # argument span: bold on a green background
        return colored(token, on_color="on_green", attrs=["bold"])

    @staticmethod
    def TRIGGER(token):
        # trigger span: bold on a blue background
        return colored(token, on_color="on_blue", attrs=["bold"])

    @staticmethod
    def CONCEAL(token):
        # concealed token on a grey background
        return colored(token, on_color="on_grey", attrs=["concealed"])

    @staticmethod
    def MENTION(token):
        # full mention span: yellow background
        return colored(token, on_color="on_yellow")

    @staticmethod
    def highlight_mention(mention):
        """
        Formats text of mention
        """
        tokens = list(mention.sentenceObj.words)

        def paint(formatter, start, end):
            # Re-color tokens in the half-open range [start, end) in place.
            for idx in range(start, end):
                tokens[idx] = formatter(tokens[idx])

        # format TBM span like an arg
        if mention.type == "TextBoundMention":
            paint(OdinHighlighter.ARG, mention.start, mention.end)
        # highlight each argument span (role names are not displayed)
        if mention.arguments:
            for args in mention.arguments.values():
                for arg in args:
                    paint(OdinHighlighter.ARG, arg.start, arg.end)
        # format trigger distinctly from args
        if mention.trigger:
            paint(OdinHighlighter.TRIGGER, mention.trigger.start, mention.trigger.end)

        # highlight tokens contained in the mention span
        paint(OdinHighlighter.MENTION, mention.start, mention.end)
        # join with a highlighted space so the background is continuous
        highlighted = OdinHighlighter.MENTION(" ").join(tokens[mention.start:mention.end])
        before = " ".join(tokens[:mention.start])
        after = " ".join(tokens[mention.end:])
        return "{} {} {}".format(before, highlighted, after).strip()

class Mention(NLPDatum):
    """
    A labeled span of text.  Used to model textual mentions of events,
    relations, and entities.

    Parameters
    ----------
    token_interval : Interval
        The span of the Mention represented as an Interval.
    sentence : int
        The sentence index that contains the Mention.
    document : Document
        The Document in which the Mention was found.
    foundBy : str
        The Odin IE rule that produced this Mention.
    label : str
        The label most closely associated with this span.  Usually the lowest
        hyponym of "labels".
    labels : list
        The list of labels associated with this span.
    trigger : dict or None
        dict of JSON for Mention's trigger (event predicate or word(s)
        signaling the Mention).
    arguments : dict or None
        dict of JSON for Mention's arguments.
    paths : dict or None
        dict of JSON encoding the syntactic paths linking a Mention's
        arguments to its trigger (applies to Mentions produced from
        `type:"dependency"` rules).
    keep : bool
        Whether or not the Mention should be retained in serialized output.
    doc_id : str or None
        the id of the document

    Attributes
    ----------
    tokenInterval : processors.ds.Interval
        An `Interval` encoding the `start` and `end` of the `Mention`.
    start : int
        The token index that starts the `Mention`.
    end : int
        The token index that marks the end of the Mention (exclusive).
    sentenceObj : processors.ds.Sentence
        Pointer to the `Sentence` instance containing the `Mention`.
    characterStartOffset : int
        The index of the character that starts the `Mention`.
    characterEndOffset : int
        The index of the character that ends the `Mention`.
    type : Mention.TBM or Mention.EM or Mention.RM
        The type of the `Mention`.

    See Also
    --------
    [`Odin` manual](https://arxiv.org/abs/1509.07513)
    """

    TBM = "TextBoundMention"
    EM = "EventMention"
    RM = "RelationMention"

    def __init__(self,
                 token_interval,
                 sentence,
                 document,
                 foundBy,
                 label,
                 labels=None,
                 trigger=None,
                 arguments=None,
                 paths=None,
                 keep=True,
                 doc_id=None):
        NLPDatum.__init__(self)
        self.label = label
        self.labels = labels if labels else [self.label]
        self.tokenInterval = token_interval
        self.start = self.tokenInterval.start
        self.end = self.tokenInterval.end
        self.document = document
        self._doc_id = doc_id or hash(self.document)
        self.sentence = sentence
        if trigger:
            # NOTE: doc id is not stored for the trigger's json,
            # as it is assumed to be contained in the same document as its parent
            trigger.update({"document": self._doc_id})
            self.trigger = Mention.load_from_JSON(trigger, self._to_document_map())
        else:
            self.trigger = None
        # unpack args: each role maps to a list of Mention JSON dicts
        if arguments:
            self.arguments = {
                role: [Mention.load_from_JSON(a, self._to_document_map()) for a in args]
                for (role, args) in arguments.items()
            }
        else:
            self.arguments = None
        self.paths = paths
        self.keep = keep
        self.foundBy = foundBy
        # convenience pointers derived from the document
        self.sentenceObj = self.document.sentences[self.sentence]
        self.text = " ".join(self.sentenceObj.words[self.start:self.end])
        # recover character offsets from the sentence's token offsets
        self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start]
        self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1]
        # for later recovery (assigned when loading from JSON)
        self.id = None
        self.type = self._set_type()

    def __str__(self):
        return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self))

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # NOTE: relies on NLPDatum.to_JSON producing a hashable (string) form
        return hash(self.to_JSON())

    def to_JSON_dict(self):
        """
        Serialize this Mention to a JSON-compatible dict.
        """
        m = dict()
        m["id"] = self.id
        m["type"] = self.type
        m["label"] = self.label
        m["labels"] = self.labels
        m["tokenInterval"] = self.tokenInterval.to_JSON_dict()
        m["characterStartOffset"] = self.characterStartOffset
        m["characterEndOffset"] = self.characterEndOffset
        m["sentence"] = self.sentence
        m["document"] = self._doc_id
        # do we have a trigger?
        if self.trigger:
            m["trigger"] = self.trigger.to_JSON_dict()
        # do we have arguments?
        if self.arguments:
            m["arguments"] = self._arguments_to_JSON_dict()
        # handle paths
        if self.paths:
            m["paths"] = self.paths
        m["keep"] = self.keep
        m["foundBy"] = self.foundBy
        return m

    def startOffset(self):
        """Character offset where this Mention's span begins."""
        # FIX: originally indexed endOffsets, returning the *end* of the first
        # token rather than its start (cf. endOffset below).
        return self.sentenceObj.startOffsets[self.start]

    def endOffset(self):
        """Character offset where this Mention's span ends."""
        return self.sentenceObj.endOffsets[self.end - 1]

    def words(self):
        """Words for this Mention's span."""
        return self.sentenceObj.words[self.start:self.end]

    def tags(self):
        """Part of speech for this Mention's span."""
        return self.sentenceObj.tags[self.start:self.end]

    def lemmas(self):
        """Lemmas for this Mention's span."""
        return self.sentenceObj.lemmas[self.start:self.end]

    def _chunks(self):
        """Chunk labels for this Mention's span."""
        return self.sentenceObj._chunks[self.start:self.end]

    def _entities(self):
        """NE labels for this Mention's span."""
        return self.sentenceObj._entities[self.start:self.end]

    def copy(self, **kwargs):
        """
        Copy constructor for this Mention.  Any keyword argument overrides
        the corresponding attribute of the copy.
        """
        # return new instance
        return self.__class__(
            label=kwargs.get("label", self.label),
            # FIX: originally looked up the wrong key ("label"), so a caller
            # passing labels=... was silently ignored.
            labels=kwargs.get("labels", self.labels),
            token_interval=kwargs.get("token_interval", self.tokenInterval),
            sentence=kwargs.get("sentence", self.sentence),  # NOTE: this is the sentence idx
            document=kwargs.get("document", self.document),
            foundBy=kwargs.get("foundBy", self.foundBy),
            trigger=kwargs.get("trigger", self.trigger),
            arguments=kwargs.get("arguments", self.arguments),
            paths=kwargs.get("paths", self.paths),
            keep=kwargs.get("keep", self.keep),
            doc_id=kwargs.get("doc_id", self._doc_id)
        )

    def overlaps(self, other):
        """
        Test whether `other` (a token index or Mention) overlaps
        with the span of this Mention.
        """
        if isinstance(other, int):
            return self.start <= other < self.end
        elif isinstance(other, Mention):
            # equiv. sentences + checks on start and end
            return (hash(self.sentence) == hash(other.sentence)) and \
                    self.tokenInterval.overlaps(other.tokenInterval)
        else:
            return False

    def matches(self, label_pattern):
        """
        Test if the provided pattern, `label_pattern`, matches
        any element in `Mention.labels`.

        Parameters
        ----------
        label_pattern : str or _sre.SRE_Pattern
            The pattern to match against each element in `Mention.labels`

        Returns
        -------
        bool
            True if `label_pattern` matches any element in `Mention.labels`
        """
        return any(re.match(label_pattern, label) for label in self.labels)

    def _arguments_to_JSON_dict(self):
        return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items())

    def _paths_to_JSON_dict(self):
        # FIX: originally iterated self.paths without .items(), which would
        # raise when unpacking (role, paths) from the dict's keys.
        return {role: paths.to_JSON_dict() for (role, paths) in self.paths.items()}

    @staticmethod
    def load_from_JSON(mjson, docs_dict):
        """
        Deserialize a Mention from its JSON dict.

        Parameters
        ----------
        mjson : dict
            JSON dict of a Mention (as produced by to_JSON_dict).
        docs_dict : dict
            Mapping of doc id -> Document used to resolve mjson["document"].
        """
        # recover document
        doc_id = mjson["document"]
        doc = docs_dict[doc_id]
        labels = mjson["labels"]
        kwargs = {
            "label": mjson.get("label", labels[0]),
            "labels": labels,
            "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]),
            "sentence": mjson["sentence"],
            "document": doc,
            "doc_id": doc_id,
            "trigger": mjson.get("trigger", None),
            "arguments": mjson.get("arguments", None),
            "paths": mjson.get("paths", None),
            "keep": mjson.get("keep", True),
            "foundBy": mjson["foundBy"]
        }
        m = Mention(**kwargs)
        # set IDs
        m.id = mjson["id"]
        m._doc_id = doc_id
        # restore character offsets
        # FIX: originally assigned snake_case character_start_offset /
        # character_end_offset, leaving the camelCase attributes used by
        # to_JSON_dict untouched and creating dead duplicates.
        m.characterStartOffset = mjson["characterStartOffset"]
        m.characterEndOffset = mjson["characterEndOffset"]
        return m

    def _to_document_map(self):
        # minimal doc-id -> Document mapping for nested load_from_JSON calls
        return {self._doc_id: self.document}

    def _set_type(self):
        """Classify this Mention as an event, textbound, or relation mention."""
        # event mention: has a trigger
        if self.trigger is not None:
            return Mention.EM
        # textbound mention: neither trigger nor arguments
        elif self.arguments is None:
            return Mention.TBM
        # relation mention: arguments but no trigger
        else:
            return Mention.RM