Source code for pysumo.parser

""" The pySUMO parsing module. It contains functions to parse and serialize
kif-files. It also handles the Ontology's data structure and parses the mapping
of SUMO terms to WordNet.


This module contains:

- AbstractSyntaxTree: The in-memory representation of an Ontology.
- Ontology: Contains basic information about an Ontology.

"""

import re
from enum import Enum
from io import StringIO
from pickle import dumps

from .logger import actionlog

def _tokenize_docstring(chars, f):
    """ Tokenizes a line containing a quoted docstring, reading further
    lines from f until all quotes are balanced. Returns a tuple of the
    token list and the number of extra lines consumed. """
    n = 0
    ret = []
    # Keep reading lines until the number of quotes is even, i.e. until
    # the docstring is closed.
    while chars.count('"') % 2 != 0:
        n += 1
        line = f.readline().strip()
        chars = "".join([chars, line])
    chars = chars.split('"')
    # Alternate between code segments (tokenized) and quoted segments
    # (kept verbatim, re-wrapped in quotes).
    while len(chars) > 1:
        ret.extend(_tokenize(_cleanup(chars.pop(0))))
        ret.append("".join(['"', chars.pop(0), '"']))
    ret.extend(_tokenize(_cleanup(chars.pop(0))))
    return (ret, n)


def _tokenize(chars):
    """ Splits a line of KIF into tokens, treating each parenthesis as a
    token of its own. """
    return chars.replace('(', ' ( ').replace(')', ' ) ').split()

def _cleanup(chars):
    """ Strips comments (everything after ';') and surrounding whitespace
    from a line. A line whose ';' occurs after the opening quote of a
    docstring is returned unchanged. """
    if '"' in chars and ';' in chars and chars.find(';') > chars.find('"'):
        return chars
    return chars.split(';')[0].strip()
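
# A minimal sketch of how the helpers above compose (the input line is
# illustrative, not taken from a real ontology):
#
#     _cleanup('(instance Foo Bar) ; a comment')  ->  '(instance Foo Bar)'
#     _tokenize('(instance Foo Bar)')  ->  ['(', 'instance', 'Foo', 'Bar', ')']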

def kifparse(infile, ontology, ast=None):
    """ Parses an ontology and returns an AbstractSyntaxTree.

    Args:

    - infile: the file object to parse
    - ontology: the ontology to parse
    - ast: the AST of the ontologies on which this ontology depends

    Returns:

    - AbstractSyntaxTree

    """
    root = AbstractSyntaxTree(ontology)
    oldline = None
    linenumber = -1
    offset = 0
    for i, line in enumerate(infile):
        line = _cleanup(line)
        if line == "":
            continue
        if '"' in line:
            # Docstrings may span several lines; track how many extra
            # lines were consumed so later line numbers stay correct.
            line, n = _tokenize_docstring(line, infile)
            offset += n
        else:
            line = _tokenize(line)
        if oldline is not None:
            # Continue an expression that started on an earlier line.
            line = oldline + line
            oldline = None
        if line[0] != '(':
            raise ParseError(" ".join(line), i + 1)
        if line.count('(') != line.count(')'):
            # Unbalanced parentheses: the expression continues on the
            # next line.
            if linenumber == -1:
                linenumber = i + offset
            oldline = line
            continue
        if linenumber == -1:
            linenumber = i + offset
        while line != [] and line.count('(') == line.count(')'):
            node = AbstractSyntaxTree(ontology, line=linenumber)
            parsed = node.parse(line)
            line = line[parsed:]
            root.add_child(node)
        linenumber = -1
    if oldline is not None:
        raise ParseError(" ".join(oldline), linenumber)
    return root

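# A minimal usage sketch (the file name and the Ontology instance are
# illustrative, not part of this module):
#
#     with open('merge.kif') as f:
#         root = kifparse(f, ontology)
#     print(len(root.children))  # number of top-level expressions
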
def astmerge(trees):
    """ Merges two Abstract Syntax Trees.

    Args:

    - trees: a tuple of 2 AST objects

    Returns:

    - AbstractSyntaxTree

    """
    out = AbstractSyntaxTree(None)
    out.children.extend(trees[0].children)
    out.children.extend(trees[1].children)
    return out

def kifserialize(ast, ontology, out):
    """ Writes ontology to disk as KIF. Walks over ast and writes out all
    nodes that belong to ontology.

    Args:

    - ast: the Abstract Syntax Tree
    - ontology: the ontology which is written to disk
    - out: the file object to write to

    Raises:

    - OSError

    """
    for child in ast.children:
        if child.ontology != ontology:
            continue
        line = "".join([str(child), '\n'])
        out.write(line)

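# A minimal round-trip sketch (file names are illustrative):
#
#     with open('merge.kif') as f:
#         root = kifparse(f, ontology)
#     with open('out.kif', 'w') as f:
#         kifserialize(root, ontology, f)
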
# Matches a single line of a WordNet data file that carries a SUMO mapping
# (a '&%Term' reference ending in one of the suffixes [, ], @, +, : or =).
WORDNET_REGEX = re.compile(r'^(\d{8}) (\d{2}) ([nvasr]) ([0-9a-zA-Z]{2})(?: (\S+ ([0-9a-zA-Z])))+ (\d{3})(?: ((\S{1,2}) \d{8} [nvasr] [0-9a-zA-Z]{4}))*(?: \d{2} (\+ \d{2} [0-9a-zA-Z]{2} )+)? ?\| .+ &%.+[\][@+:=]$')

def wparse(datafiles):
    """ Parses the files containing the SUMO-WordNet mapping.

    Args:

    - datafiles: a list of (file object, Pos) pairs, one per
      SUMO-WordNet mapping file

    Returns:

    - A dictionary mapping SUMO terms to sets of SUMOConceptWordNetItem
      objects

    """
    mapping = dict()
    total, processed = 0, 0
    for data, pos in datafiles:
        data.seek(0)
        sdata = StringIO(data.read().decode('utf8'))
        for line in sdata:
            total += 1
            # Syntactical validation
            if WORDNET_REGEX.match(line):
                items = _wtokenize(line.rstrip('\n'), pos)
                processed += 1
                for item in items:
                    try:
                        mapping[item.sumo_concept].add(item)
                    except KeyError:
                        mapping[item.sumo_concept] = {item}
    # TODO: Remove the fixed assertions/replace with dynamic assertions
    assert total == 117939, '%d lines were read, but %d lines should have been read' % (total, 117939)
    assert processed >= 117659 - 2000, 'processed %d, should have processed %d' % (processed, 117659)
    return mapping

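# A minimal usage sketch (file names are illustrative; the files are opened
# in binary mode because wparse decodes the bytes itself). Note that the
# fixed-count assertions in wparse expect the complete set of mapping files:
#
#     handles = [(open('data.noun', 'rb'), Pos.noun),
#                (open('data.verb', 'rb'), Pos.verb)]
#     mapping = wparse(handles)
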
def _wtokenize(line, pos):
    """ Returns all the tokens of a WordNet data line. """
    items = line.split(' ')
    # Header: byte offset in the current file, lexicographer file number,
    # synset type and word count (hexadecimal).
    synset_offset = int(items.pop(0))
    lex_filenum = int(items.pop(0))
    ss_type = SSType(items.pop(0))
    w_cnt = int(items.pop(0), 16)
    synset = list()
    for i in range(0, w_cnt):
        word = items.pop(0)
        if pos == Pos.adj and word[-1] == ')':
            # Adjectives may carry a syntactic marker, e.g. 'word(p)'.
            listy = word.split('(')
            syn_marker = listy.pop()[:-1]
            word = listy.pop()
        else:
            syn_marker = None
        lex_id = int(items.pop(0), 16)
        synset.append((word, syn_marker, lex_id))
    assert len(synset) == w_cnt, 'line %s has %d words, but should have %d' % (line, len(synset), w_cnt)
    p_cnt = int(items.pop(0))
    ptr_list = list()
    for i in range(0, p_cnt):
        pointer_symbol = items.pop(0)
        # Use a separate name for the pointer's offset so it does not
        # shadow the synset_offset of the line itself.
        p_offset = int(items.pop(0))
        p_pos = Pos(items.pop(0))
        so_ta = items.pop(0)
        source = int(so_ta[:2], 16)
        target = int(so_ta[2:], 16)
        ptr_list.append((pointer_symbol, p_pos, p_offset, source, target))
    assert len(ptr_list) == p_cnt, 'line "%s" has %d pointers, but %s only contains %d' % (line, p_cnt, ptr_list, len(ptr_list))
    frames = None
    if ss_type == SSType.verb:
        f_cnt = int(items.pop(0))
        frames = set()
        for i in range(0, f_cnt):
            assert items.pop(0) == '+', "Frames not separated by a '+'"
            f_num = int(items.pop(0))
            w_num = int(items.pop(0), 16)
            frames.add((f_num, w_num))
        assert len(frames) == f_cnt, 'line %s has %d frames, but should have %d' % (line, len(frames), f_cnt)
    assert items.pop(0) == '|', "Missing '|' separator"
    assert len(items) != 0, 'No gloss or SUMO-term in %s' % line
    string = ' '.join(items)
    assert string != ''
    items = string.split('&%')
    assert len(items) >= 2, '"%s": %s should contain at least 2 items, but contains %d' % (line, items, len(items))
    gloss = items.pop(0).rstrip()
    sumo_concepts = set()
    while len(items) > 0:
        name = items.pop(0)
        # The trailing character is the mapping-relation suffix.
        suffix = name[-1:]
        name = name[:-1]
        sumo_concepts.add(SUMOConceptWordNetItem(name, suffix, synset_offset,
                                                 lex_filenum, ss_type, synset,
                                                 ptr_list, frames, gloss))
    return sumo_concepts

class SUMOConceptWordNetItem:
    """ The object returned from _wtokenize containing info on the
    SUMO-WordNet mapping. """

    def __init__(self, sumo_concept, suffix, synset_offset, lex_filenum,
                 ss_type, synset, ptr_list, frames, gloss):
        self.sumo_concept = sumo_concept
        self.suffix = suffix
        self.synset_offset = synset_offset
        self.lex_filenum = lex_filenum
        self.ss_type = ss_type
        self.synset = synset
        self.ptr_list = ptr_list
        self.frames = frames
        self.gloss = gloss

class Pos(Enum):
    noun = 'n'
    verb = 'v'
    adj = 'a'
    adv = 'r'


class SSType(Enum):
    noun = 'n'
    verb = 'v'
    adj = 'a'
    adv = 'r'
    adj_sat = 's'

class AbstractSyntaxTree:
    """ The AbstractSyntaxTree is a node in the abstract syntax tree, which
    is defined by a root node and its children. It is the in-memory
    representation of the loaded Ontologies, intended for internal purposes
    only, and should never be passed outside of the library.

    Variables:

    - parent: The parent node.
    - children: A list of child nodes.
    - name: The name of the AbstractSyntaxTree object.
    - element_type: The type of the node element.
    - ontology: The Ontology object to which this node corresponds.
    - line: The line number at which this node starts in the source file.

    Methods:

    - add_child: Adds a child node.
    - remove_child: Removes a child node.

    """

    def __init__(self, ontology, parent=None, line=-1):
        # Note: the parent link is currently not stored (see also the
        # commented-out assignment in add_child); storing it would make
        # the pickle-based __eq__ and __hash__ below serialize the
        # entire tree.
        self.parent = None
        self.children = []
        self.name = ''
        self.element_type = ''
        self.ontology = ontology
        self.line = line

    def __repr__(self):
        if len(self.children) == 0:
            return self.name
        out = " ".join(["(", self.name, ""])
        for child in self.children:
            out = "".join([out, str(child), " "])
        out = "".join([out, ")"])
        return out

    def __eq__(self, other):
        return dumps(self) == dumps(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(dumps(self))

    def parse(self, tokens):
        """ Builds the subtree rooted at self from tokens and returns the
        number of tokens consumed. """
        skip = 0
        for i, token in enumerate(tokens):
            if skip > 0:
                skip -= 1
                continue
            if token == '(':
                if self.name == '':
                    # The token after the opening '(' names this node.
                    self.name = tokens[i + 1]
                    skip += 1
                else:
                    # A nested expression becomes a child subtree.
                    child = AbstractSyntaxTree(self.ontology, parent=self,
                                               line=self.line)
                    skip += child.parse(tokens[i:])
                    skip -= 1
                    self.add_child(child)
            elif token == ')':
                return i + 1
            else:
                # A bare token becomes a leaf child.
                child = AbstractSyntaxTree(self.ontology, parent=self,
                                           line=self.line)
                child.name = token
                self.add_child(child)

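    # A minimal sketch of parse and __repr__ (the KIF expression is
    # illustrative):
    #
    #     node = AbstractSyntaxTree(None)
    #     node.parse(_tokenize('(instance Foo Bar)'))
    #     str(node)  ->  '( instance Foo Bar )'
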
    def add_child(self, entry):
        """ Adds entry as a child to self. """
        # entry.parent = self  (left disabled; see the note in __init__)
        self.children.append(entry)

    def remove_child(self, entry):
        """ Removes entry from the node's children. """
        self.children.remove(entry)

class ParseError(Exception):
    """ Raised by kifparse when it encounters malformed KIF input. """

    def __init__(self, line, linenumber):
        self.line = line
        self.linenumber = linenumber

    def __str__(self):
        return "".join(["Parse error in line ", str(self.linenumber), "\n", self.line])