Source code for pysumo.parser

""" The pySUMO parsing module. It contains functions to parse and serialize
kif-files. It also handles the Ontology's data structure and parses the mapping
of SUMO terms to WordNet.


This module contains:

- AbstractSyntaxTree: The in-memory representation of an Ontology.
- Ontology: Contains basic information about an Ontology.

"""

import re
from enum import Enum
from io import StringIO
from pickle import dumps

from .logger import actionlog

def _tokenize_docstring(chars, f):
    """ Tokenizes a line containing a quoted docstring, reading further
    lines from f until all quotes are balanced. Returns a tuple of the
    token list and the number of extra lines consumed. """
    n = 0
    ret = []
    # Keep reading lines until the number of quotes is even, i.e. until
    # the docstring is closed.
    while chars.count('"') % 2 != 0:
        n += 1
        line = f.readline().strip()
        chars = "".join([chars, line])
    chars = chars.split('"')
    # Alternate between code segments (tokenized) and quoted segments
    # (kept verbatim, re-wrapped in quotes).
    while len(chars) > 1:
        ret.extend(_tokenize(_cleanup(chars.pop(0))))
        ret.append("".join(['"', chars.pop(0), '"']))
    ret.extend(_tokenize(_cleanup(chars.pop(0))))
    return (ret, n)


def _tokenize(chars):
    """ Splits a line of KIF into tokens, treating each parenthesis as a
    token of its own. """
    return chars.replace('(', ' ( ').replace(')', ' ) ').split()

def _cleanup(chars):
    """ Strips comments (everything after ';') and surrounding whitespace
    from a line. A line whose ';' occurs after the opening quote of a
    docstring is returned unchanged. """
    if '"' in chars and ';' in chars and chars.find(';') > chars.find('"'):
        return chars
    return chars.split(';')[0].strip()
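
# A minimal sketch of how the helpers above compose (the input line is
# illustrative, not taken from a real ontology):
#
#     _cleanup('(instance Foo Bar) ; a comment')  ->  '(instance Foo Bar)'
#     _tokenize('(instance Foo Bar)')  ->  ['(', 'instance', 'Foo', 'Bar', ')']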

def kifparse(infile, ontology, ast=None):
    """ Parses an ontology and returns an AbstractSyntaxTree.

    Args:

    - infile: the file object to parse
    - ontology: the ontology to parse
    - ast: the AST of the ontologies on which this ontology depends

    Returns:

    - AbstractSyntaxTree

    """
    root = AbstractSyntaxTree(ontology)
    oldline = None
    linenumber = -1
    offset = 0
    for i, line in enumerate(infile):
        line = _cleanup(line)
        if line == "":
            continue
        if '"' in line:
            # Docstrings may span several lines; track how many extra
            # lines were consumed so later line numbers stay correct.
            line, n = _tokenize_docstring(line, infile)
            offset += n
        else:
            line = _tokenize(line)
        if oldline is not None:
            # Continue an expression that started on an earlier line.
            line = oldline + line
            oldline = None
        if line[0] != '(':
            raise ParseError(" ".join(line), i + 1)
        if line.count('(') != line.count(')'):
            # Unbalanced parentheses: the expression continues on the
            # next line.
            if linenumber == -1:
                linenumber = i + offset
            oldline = line
            continue
        if linenumber == -1:
            linenumber = i + offset
        while line != [] and line.count('(') == line.count(')'):
            node = AbstractSyntaxTree(ontology, line=linenumber)
            parsed = node.parse(line)
            line = line[parsed:]
            root.add_child(node)
        linenumber = -1
    if oldline is not None:
        raise ParseError(" ".join(oldline), linenumber)
    return root

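# A minimal usage sketch (the file name and the Ontology instance are
# illustrative, not part of this module):
#
#     with open('merge.kif') as f:
#         root = kifparse(f, ontology)
#     print(len(root.children))  # number of top-level expressions
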
def astmerge(trees):
    """ Merges two Abstract Syntax Trees.

    Args:

    - trees: a tuple of 2 AST objects

    Returns:

    - AbstractSyntaxTree

    """
    out = AbstractSyntaxTree(None)
    out.children.extend(trees[0].children)
    out.children.extend(trees[1].children)
    return out

def kifserialize(ast, ontology, out):
    """ Writes ontology to disk as KIF. Walks over ast and writes out all
    nodes that belong to ontology.

    Args:

    - ast: the Abstract Syntax Tree
    - ontology: the ontology which is written to disk
    - out: the file object to write to

    Raises:

    - OSError

    """
    for child in ast.children:
        if child.ontology != ontology:
            continue
        line = "".join([str(child), '\n'])
        out.write(line)

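# A minimal round-trip sketch (file names are illustrative):
#
#     with open('merge.kif') as f:
#         root = kifparse(f, ontology)
#     with open('out.kif', 'w') as f:
#         kifserialize(root, ontology, f)
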
# Matches a single line of a WordNet data file that carries a SUMO mapping
# (a '&%Term' reference ending in one of the suffixes [, ], @, +, : or =).
WORDNET_REGEX = re.compile(r'^(\d{8}) (\d{2}) ([nvasr]) ([0-9a-zA-Z]{2})(?: (\S+ ([0-9a-zA-Z])))+ (\d{3})(?: ((\S{1,2}) \d{8} [nvasr] [0-9a-zA-Z]{4}))*(?: \d{2} (\+ \d{2} [0-9a-zA-Z]{2} )+)? ?\| .+ &%.+[\][@+:=]$')

def wparse(datafiles):
    """ Parses the files containing the SUMO-WordNet mapping.

    Args:

    - datafiles: a list of (file object, Pos) pairs, one per
      SUMO-WordNet mapping file

    Returns:

    - A dictionary mapping SUMO terms to sets of SUMOConceptWordNetItem
      objects

    """
    mapping = dict()
    total, processed = 0, 0
    for data, pos in datafiles:
        data.seek(0)
        sdata = StringIO(data.read().decode('utf8'))
        for line in sdata:
            total += 1
            # Syntactical validation
            if WORDNET_REGEX.match(line):
                items = _wtokenize(line.rstrip('\n'), pos)
                processed += 1
                for item in items:
                    try:
                        mapping[item.sumo_concept].add(item)
                    except KeyError:
                        mapping[item.sumo_concept] = {item}
    # TODO: Remove the fixed assertions/replace with dynamic assertions
    assert total == 117939, '%d lines were read, but %d lines should have been read' % (total, 117939)
    assert processed >= 117659 - 2000, 'processed %d, should have processed %d' % (processed, 117659)
    return mapping

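# A minimal usage sketch (file names are illustrative; the files are opened
# in binary mode because wparse decodes the bytes itself). Note that the
# fixed-count assertions in wparse expect the complete set of mapping files:
#
#     handles = [(open('data.noun', 'rb'), Pos.noun),
#                (open('data.verb', 'rb'), Pos.verb)]
#     mapping = wparse(handles)
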
def _wtokenize(line, pos):
    """ Returns all the tokens of a WordNet data line. """
    items = line.split(' ')
    # Header: byte offset in the current file, lexicographer file number,
    # synset type and word count (hexadecimal).
    synset_offset = int(items.pop(0))
    lex_filenum = int(items.pop(0))
    ss_type = SSType(items.pop(0))
    w_cnt = int(items.pop(0), 16)
    synset = list()
    for i in range(0, w_cnt):
        word = items.pop(0)
        if pos == Pos.adj and word[-1] == ')':
            # Adjectives may carry a syntactic marker, e.g. 'word(p)'.
            listy = word.split('(')
            syn_marker = listy.pop()[:-1]
            word = listy.pop()
        else:
            syn_marker = None
        lex_id = int(items.pop(0), 16)
        synset.append((word, syn_marker, lex_id))
    assert len(synset) == w_cnt, 'line %s has %d words, but should have %d' % (line, len(synset), w_cnt)
    p_cnt = int(items.pop(0))
    ptr_list = list()
    for i in range(0, p_cnt):
        pointer_symbol = items.pop(0)
        # Use a separate name for the pointer's offset so it does not
        # shadow the synset_offset of the line itself.
        p_offset = int(items.pop(0))
        p_pos = Pos(items.pop(0))
        so_ta = items.pop(0)
        source = int(so_ta[:2], 16)
        target = int(so_ta[2:], 16)
        ptr_list.append((pointer_symbol, p_pos, p_offset, source, target))
    assert len(ptr_list) == p_cnt, 'line "%s" has %d pointers, but %s only contains %d' % (line, p_cnt, ptr_list, len(ptr_list))
    frames = None
    if ss_type == SSType.verb:
        f_cnt = int(items.pop(0))
        frames = set()
        for i in range(0, f_cnt):
            assert items.pop(0) == '+', "Frames not separated by a '+'"
            f_num = int(items.pop(0))
            w_num = int(items.pop(0), 16)
            frames.add((f_num, w_num))
        assert len(frames) == f_cnt, 'line %s has %d frames, but should have %d' % (line, len(frames), f_cnt)
    assert items.pop(0) == '|', "Missing '|' separator"
    assert len(items) != 0, 'No gloss or SUMO-term in %s' % line
    string = ' '.join(items)
    assert string != ''
    items = string.split('&%')
    assert len(items) >= 2, '"%s": %s should contain at least 2 items, but contains %d' % (line, items, len(items))
    gloss = items.pop(0).rstrip()
    sumo_concepts = set()
    while len(items) > 0:
        name = items.pop(0)
        # The trailing character is the mapping-relation suffix.
        suffix = name[-1:]
        name = name[:-1]
        sumo_concepts.add(SUMOConceptWordNetItem(name, suffix, synset_offset,
                                                 lex_filenum, ss_type, synset,
                                                 ptr_list, frames, gloss))
    return sumo_concepts

class SUMOConceptWordNetItem:
    """ The object returned from _wtokenize containing info on the
    SUMO-WordNet mapping. """

    def __init__(self, sumo_concept, suffix, synset_offset, lex_filenum,
                 ss_type, synset, ptr_list, frames, gloss):
        self.sumo_concept = sumo_concept
        self.suffix = suffix
        self.synset_offset = synset_offset
        self.lex_filenum = lex_filenum
        self.ss_type = ss_type
        self.synset = synset
        self.ptr_list = ptr_list
        self.frames = frames
        self.gloss = gloss

class Pos(Enum):
    noun = 'n'
    verb = 'v'
    adj = 'a'
    adv = 'r'


class SSType(Enum):
    noun = 'n'
    verb = 'v'
    adj = 'a'
    adv = 'r'
    adj_sat = 's'

class AbstractSyntaxTree:
    """ The AbstractSyntaxTree is a node in the abstract syntax tree, which
    is defined by a root node and its children. It is the in-memory
    representation of the loaded Ontologies, intended for internal purposes
    only, and should never be passed outside of the library.

    Variables:

    - parent: The parent node.
    - children: A list of child nodes.
    - name: The name of the AbstractSyntaxTree object.
    - element_type: The type of the node element.
    - ontology: The Ontology object to which this node corresponds.
    - line: The line number at which this node starts in the source file.

    Methods:

    - add_child: Adds a child node.
    - remove_child: Removes a child node.

    """

    def __init__(self, ontology, parent=None, line=-1):
        # Note: the parent link is currently not stored (see also the
        # commented-out assignment in add_child); storing it would make
        # the pickle-based __eq__ and __hash__ below serialize the
        # entire tree.
        self.parent = None
        self.children = []
        self.name = ''
        self.element_type = ''
        self.ontology = ontology
        self.line = line

    def __repr__(self):
        if len(self.children) == 0:
            return self.name
        out = " ".join(["(", self.name, ""])
        for child in self.children:
            out = "".join([out, str(child), " "])
        out = "".join([out, ")"])
        return out

    def __eq__(self, other):
        return dumps(self) == dumps(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(dumps(self))

    def parse(self, tokens):
        """ Builds the subtree rooted at self from tokens and returns the
        number of tokens consumed. """
        skip = 0
        for i, token in enumerate(tokens):
            if skip > 0:
                skip -= 1
                continue
            if token == '(':
                if self.name == '':
                    # The token after the opening '(' names this node.
                    self.name = tokens[i + 1]
                    skip += 1
                else:
                    # A nested expression becomes a child subtree.
                    child = AbstractSyntaxTree(self.ontology, parent=self,
                                               line=self.line)
                    skip += child.parse(tokens[i:])
                    skip -= 1
                    self.add_child(child)
            elif token == ')':
                return i + 1
            else:
                # A bare token becomes a leaf child.
                child = AbstractSyntaxTree(self.ontology, parent=self,
                                           line=self.line)
                child.name = token
                self.add_child(child)

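    # A minimal sketch of parse and __repr__ (the KIF expression is
    # illustrative):
    #
    #     node = AbstractSyntaxTree(None)
    #     node.parse(_tokenize('(instance Foo Bar)'))
    #     str(node)  ->  '( instance Foo Bar )'
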
    def add_child(self, entry):
        """ Adds entry as a child to self. """
        # entry.parent = self  (left disabled; see the note in __init__)
        self.children.append(entry)

    def remove_child(self, entry):
        """ Removes entry from the node's children. """
        self.children.remove(entry)

class ParseError(Exception):
    """ Raised by kifparse when it encounters malformed KIF input. """

    def __init__(self, line, linenumber):
        self.line = line
        self.linenumber = linenumber

    def __str__(self):
        return "".join(["Parse error in line ", str(self.linenumber), "\n", self.line])