123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345 |
- # Natural Language Toolkit: Combinatory Categorial Grammar
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- CCG Lexicons
- """
- from __future__ import unicode_literals
- import re
- from collections import defaultdict
- from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
- from nltk.compat import python_2_unicode_compatible
- from nltk.internals import deprecated
- from nltk.sem.logic import Expression
# ------------
# Regular expressions used for parsing components of the lexicon
# ------------

# A primitive category name with an optional bracketed, comma-separated
# subscript list, e.g. ``NP`` or ``NP[sg]``.
# Groups: (name, "[subscripts]" or None).
PRIM_RE = re.compile(r"([A-Za-z]+)(\[[A-Za-z,]+\])?")

# Splits the leading primitive category (subscripts included) from the
# text that follows it.  Groups: (primitive, remainder).
NEXTPRIM_RE = re.compile(r"([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)")

# Splits the leading application operator from the text that follows it:
# a slash, up to two optional ``.``/``,`` characters, then the remainder.
# Groups: (slash, first modifier, second modifier, remainder).
APP_RE = re.compile(r"([\\/])([.,]?)([.,]?)(.*)")

# A lexicon line: an identifier, a separator (``::`` or an arrow built
# from ``-``/``=`` and ``>``), and the right-hand side (rhs).
LEX_RE = re.compile(r"([\S_]+)\s*(::|[-=]+>)\s*(.+)", re.UNICODE)

# A right-hand side: the category text, optionally followed by a
# brace-delimited semantic predicate.  Groups: (category, "{...}" or None).
RHS_RE = re.compile(r"([^{}]*[^ {}])\s*(\{[^}]+\})?", re.UNICODE)

# Captures the text between the braces of a semantic predicate.
SEMANTICS_RE = re.compile(r"\{([^}]+)\}", re.UNICODE)

# Captures everything before the first ``#``; the comment itself is dropped.
COMMENTS_RE = re.compile("([^#]*)(?:#.*)?")
class Token(object):
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (the token's category)
    * `semantics` (Expression, or None when no semantics were given)

    Two tokens compare equal when their category and semantics are equal;
    the surface `token` string does not take part in comparison.
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        """Return the category of this token."""
        return self._categ

    def semantics(self):
        """Return the semantics of this token, or None."""
        return self._semantics

    def __str__(self):
        semantics_str = ""
        if self._semantics is not None:
            semantics_str = " {" + str(self._semantics) + "}"
        return "" + str(self._categ) + semantics_str

    def _comparison_key(self):
        # The pair every comparison and the hash are based on.
        return (self._categ, self._semantics)

    def __eq__(self, other):
        if not isinstance(other, Token):
            return NotImplemented
        return self._comparison_key() == (other.categ(), other.semantics())

    def __ne__(self, other):
        # Needed explicitly on Python 2, where `!=` is not derived from `==`.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Defining __eq__ removes the default hash on Python 3, so provide
        # one that agrees with equality.
        return hash(self._comparison_key())

    def __cmp__(self, other):
        """
        Three-way comparison on (category, semantics).

        Returns -1 when `other` is not a Token, otherwise a negative, zero
        or positive integer.  Written without the `cmp` builtin, which does
        not exist on Python 3.
        """
        if not isinstance(other, Token):
            return -1
        mine = self._comparison_key()
        theirs = (other.categ(), other.semantics())
        if mine == theirs:
            return 0
        return -1 if mine < theirs else 1
@python_2_unicode_compatible
class CCGLexicon(object):
    """
    Class representing a lexicon for CCG grammars.

    * `start`: Name of the target category; wrapped in a PrimitiveCategory
    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """
        Returns all the possible categories for a word

        This is a plain item lookup on `entries`.  With the
        ``defaultdict(list)`` that `fromstring` builds, looking up an
        unknown word returns an empty list and stores it in the mapping.
        """
        return self._entries[word]

    def start(self):
        """
        Return the target category for the parser
        """
        return self._start

    def __str__(self):
        """
        String representation of the lexicon. Used for debugging.

        One line per word, in sorted order, formatted as
        ``word => cat | cat | ...``.
        """
        lines = []
        for ident in sorted(self._entries):
            # A word may have no categories at all; it still gets its own
            # line so that following entries are not run together.
            alternatives = " | ".join("%s" % cat for cat in self._entries[ident])
            lines.append(ident + " => " + alternatives)
        return "\n".join(lines)
- # -----------
- # Parsing lexicons
- # -----------
def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest of
    the input.

    The first character of `string` is taken to be the opening bracket
    (it is not inspected).  The scan is a single linear pass, so deeply
    nested input cannot hit the recursion limit.

    :param string: text beginning with an opening bracket
    :return: a pair ``(bracketed, rest)`` where ``bracketed`` includes the
        outer brackets and ``rest`` is everything after the matching ``)``
    :raises AssertionError: if a bracket is never closed; the message
        quotes the input from the innermost unclosed bracket onwards
    """
    # Positions of brackets that are currently open; index 0 is the outer one.
    open_positions = [0]
    for index in range(1, len(string)):
        char = string[index]
        if char == '(':
            open_positions.append(index)
        elif char == ')':
            open_positions.pop()
            if not open_positions:
                return ('(' + string[1 : index + 1], string[index + 1 :])
    raise AssertionError(
        'Unmatched bracket in string \'' + string[open_positions[-1] :] + '\''
    )
def nextCategory(string):
    """
    Split off the leading category of `string`.

    A leading ``(`` means the category is a bracketed group and is split
    with `matchBrackets`; anything else is split with `NEXTPRIM_RE`.
    Either way the result is a pair ``(category text, remaining text)``.
    """
    if not string.startswith('('):
        return NEXTPRIM_RE.match(string).groups()
    return matchBrackets(string)
def parseApplication(app):
    """
    Build a `Direction` from a parsed application operator.

    The first element of `app` is passed as the first argument and the
    remaining elements as the second.
    """
    slash = app[0]
    modifiers = app[1:]
    return Direction(slash, modifiers)
def parseSubscripts(subscr):
    """
    Turn a bracketed subscript string such as ``[sg,pl]`` into a list of
    subscript names.  A missing or empty subscript gives an empty list.
    """
    # Drop the surrounding brackets, then split on commas.
    return subscr[1:-1].split(',') if subscr else []
def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Resolve one primitive-category chunk to a category.

    `chunks` is a ``(name, subscripts)`` pair.  The name is resolved, in
    this order, as:

    * the special name ``var`` without subscripts, which yields the current
      `CCGVar` (creating one when `var` is None);
    * a family name, which yields the family's category, with the family's
      own variable replaced by `var` when one is already in use;
    * a primitive name, which yields a `PrimitiveCategory` with the parsed
      subscripts.

    Returns ``(category, var)``.  Raises AssertionError when the name is
    neither a family nor a primitive category.
    """
    catstr = chunks[0]

    if catstr == "var" and chunks[1] is None:
        shared = CCGVar() if var is None else var
        return (shared, shared)

    if catstr in families:
        family_cat, family_var = families[catstr]
        if var is not None:
            return (family_cat.substitute([(family_var, var)]), var)
        # No variable in use yet: adopt the one the family was defined with.
        return (family_cat, family_var)

    if catstr in primitives:
        return (PrimitiveCategory(catstr, parseSubscripts(chunks[1])), var)

    raise AssertionError(
        'String \'' + catstr + '\' is neither a family nor primitive category.'
    )
def augParseCategory(line, primitives, families, var=None):
    """
    Parse the category described by `line`.

    The text is read left to right as a first operand followed by zero or
    more ``operator operand`` pairs; each pair wraps the category built so
    far in a `FunctionalCategory`.  Returns ``(category, var)`` where `var`
    is the CCG variable in use after parsing (possibly None).
    """

    def parse_operand(text, current_var):
        # A bracketed operand is parsed recursively without its outer
        # brackets; anything else is a single primitive/family/var chunk.
        if text.startswith('('):
            return augParseCategory(text[1:-1], primitives, families, current_var)
        return parsePrimitiveCategory(
            PRIM_RE.match(text).groups(), primitives, families, current_var
        )

    head, remainder = nextCategory(line)
    result, var = parse_operand(head, var)

    while remainder != "":
        operator = APP_RE.match(remainder).groups()
        direction = parseApplication(operator[0:3])
        operand, remainder = nextCategory(operator[3])
        argument, var = parse_operand(operand, var)
        result = FunctionalCategory(result, argument, direction)

    return (result, var)
def fromstring(lex_str, include_semantics=False):
    """
    Convert string representation into a lexicon for CCGs.

    Each line is handled after stripping ``#`` comments and surrounding
    whitespace; blank lines are skipped.

    * ``:- S, N, NP`` adds primitive categories; the first primitive seen
      becomes the start category.
    * ``Ident :: category`` defines a family.
    * ``word => category {semantics}`` adds a word entry.  The semantics
      are parsed only when `include_semantics` is True, in which case every
      word entry must have them (AssertionError otherwise).

    Returns a `CCGLexicon`.
    """
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)

    for raw_line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = COMMENTS_RE.match(raw_line).group(1).strip()
        if line == "":
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            # The first one is the target category
            # ie, :- S, N, NP, VP
            primitives.extend(name.strip() for name in line[2:].strip().split(','))
            continue

        # Either a family definition, or a word definition
        ident, sep, rhs = LEX_RE.match(line).groups()
        catstr, semantics_str = RHS_RE.match(rhs).groups()
        cat, var = augParseCategory(catstr, primitives, families)

        if sep == '::':
            # Family definition
            # ie, Det :: NP/N
            families[ident] = (cat, var)
            continue

        semantics = None
        if include_semantics is True:
            if semantics_str is None:
                raise AssertionError(
                    line
                    + " must contain semantics because include_semantics is set to True"
                )
            semantics = Expression.fromstring(
                SEMANTICS_RE.match(semantics_str).group(1)
            )

        # Word definition
        # ie, which => (N\N)/(S/NP)
        entries[ident].append(Token(ident, cat, semantics))

    return CCGLexicon(primitives[0], primitives, families, entries)
@deprecated('Use fromstring() instead.')
def parseLexicon(lex_str):
    """
    Deprecated alias for `fromstring`: parse `lex_str` into a lexicon
    with `include_semantics` left at its default.
    """
    return fromstring(lex_str)
# A small ready-made lexicon, parsed once when this module is imported.
openccg_tinytiny = fromstring(
    """
    # Rather minimal lexicon based on the openccg `tinytiny' grammar.
    # Only incorporates a subset of the morphological subcategories, however.
    :- S,NP,N # Primitive categories
    Det :: NP/N # Determiners
    Pro :: NP
    IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
    IntransVpl :: S\\NP[pl] # Plural
    TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
    TransVpl :: S\\NP[pl]/NP # Plural
    the => NP[sg]/N[sg]
    the => NP[pl]/N[pl]
    I => Pro
    me => Pro
    we => Pro
    us => Pro
    book => N[sg]
    books => N[pl]
    peach => N[sg]
    peaches => N[pl]
    policeman => N[sg]
    policemen => N[pl]
    boy => N[sg]
    boys => N[pl]
    sleep => IntransVsg
    sleep => IntransVpl
    eat => IntransVpl
    eat => TransVpl
    eats => IntransVsg
    eats => TransVsg
    see => TransVpl
    sees => TransVsg
    """
)
|