lexicon.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. # Natural Language Toolkit: Combinatory Categorial Grammar
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. CCG Lexicons
  9. """
  10. from __future__ import unicode_literals
  11. import re
  12. from collections import defaultdict
  13. from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory
  14. from nltk.compat import python_2_unicode_compatible
  15. from nltk.internals import deprecated
  16. from nltk.sem.logic import Expression
  17. # ------------
  18. # Regular expressions used for parsing components of the lexicon
  19. # ------------
  20. # Parses a primitive category and subscripts
  21. PRIM_RE = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')
  22. # Separates the next primitive category from the remainder of the
  23. # string
  24. NEXTPRIM_RE = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')
  25. # Separates the next application operator from the remainder
  26. APP_RE = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')
  27. # Parses the definition of the right-hand side (rhs) of either a word or a family
  28. LEX_RE = re.compile(r'''([\S_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE)
  29. # Parses the right hand side that contains category and maybe semantic predicate
  30. RHS_RE = re.compile(r'''([^{}]*[^ {}])\s*(\{[^}]+\})?''', re.UNICODE)
  31. # Parses the semantic predicate
  32. SEMANTICS_RE = re.compile(r'''\{([^}]+)\}''', re.UNICODE)
  33. # Strips comments from a line
  34. COMMENTS_RE = re.compile('''([^#]*)(?:#.*)?''')
  35. class Token(object):
  36. """
  37. Class representing a token.
  38. token => category {semantics}
  39. e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}
  40. * `token` (string)
  41. * `categ` (string)
  42. * `semantics` (Expression)
  43. """
  44. def __init__(self, token, categ, semantics=None):
  45. self._token = token
  46. self._categ = categ
  47. self._semantics = semantics
  48. def categ(self):
  49. return self._categ
  50. def semantics(self):
  51. return self._semantics
  52. def __str__(self):
  53. semantics_str = ""
  54. if self._semantics is not None:
  55. semantics_str = " {" + str(self._semantics) + "}"
  56. return "" + str(self._categ) + semantics_str
  57. def __cmp__(self, other):
  58. if not isinstance(other, Token):
  59. return -1
  60. return cmp((self._categ, self._semantics), other.categ(), other.semantics())
  61. @python_2_unicode_compatible
  62. class CCGLexicon(object):
  63. """
  64. Class representing a lexicon for CCG grammars.
  65. * `primitives`: The list of primitive categories for the lexicon
  66. * `families`: Families of categories
  67. * `entries`: A mapping of words to possible categories
  68. """
  69. def __init__(self, start, primitives, families, entries):
  70. self._start = PrimitiveCategory(start)
  71. self._primitives = primitives
  72. self._families = families
  73. self._entries = entries
  74. def categories(self, word):
  75. """
  76. Returns all the possible categories for a word
  77. """
  78. return self._entries[word]
  79. def start(self):
  80. """
  81. Return the target category for the parser
  82. """
  83. return self._start
  84. def __str__(self):
  85. """
  86. String representation of the lexicon. Used for debugging.
  87. """
  88. string = ""
  89. first = True
  90. for ident in sorted(self._entries):
  91. if not first:
  92. string = string + "\n"
  93. string = string + ident + " => "
  94. first = True
  95. for cat in self._entries[ident]:
  96. if not first:
  97. string = string + " | "
  98. else:
  99. first = False
  100. string = string + "%s" % cat
  101. return string
  102. # -----------
  103. # Parsing lexicons
  104. # -----------
  105. def matchBrackets(string):
  106. """
  107. Separate the contents matching the first set of brackets from the rest of
  108. the input.
  109. """
  110. rest = string[1:]
  111. inside = "("
  112. while rest != "" and not rest.startswith(')'):
  113. if rest.startswith('('):
  114. (part, rest) = matchBrackets(rest)
  115. inside = inside + part
  116. else:
  117. inside = inside + rest[0]
  118. rest = rest[1:]
  119. if rest.startswith(')'):
  120. return (inside + ')', rest[1:])
  121. raise AssertionError('Unmatched bracket in string \'' + string + '\'')
  122. def nextCategory(string):
  123. """
  124. Separate the string for the next portion of the category from the rest
  125. of the string
  126. """
  127. if string.startswith('('):
  128. return matchBrackets(string)
  129. return NEXTPRIM_RE.match(string).groups()
  130. def parseApplication(app):
  131. """
  132. Parse an application operator
  133. """
  134. return Direction(app[0], app[1:])
  135. def parseSubscripts(subscr):
  136. """
  137. Parse the subscripts for a primitive category
  138. """
  139. if subscr:
  140. return subscr[1:-1].split(',')
  141. return []
  142. def parsePrimitiveCategory(chunks, primitives, families, var):
  143. """
  144. Parse a primitive category
  145. If the primitive is the special category 'var', replace it with the
  146. correct `CCGVar`.
  147. """
  148. if chunks[0] == "var":
  149. if chunks[1] is None:
  150. if var is None:
  151. var = CCGVar()
  152. return (var, var)
  153. catstr = chunks[0]
  154. if catstr in families:
  155. (cat, cvar) = families[catstr]
  156. if var is None:
  157. var = cvar
  158. else:
  159. cat = cat.substitute([(cvar, var)])
  160. return (cat, var)
  161. if catstr in primitives:
  162. subscrs = parseSubscripts(chunks[1])
  163. return (PrimitiveCategory(catstr, subscrs), var)
  164. raise AssertionError(
  165. 'String \'' + catstr + '\' is neither a family nor primitive category.'
  166. )
  167. def augParseCategory(line, primitives, families, var=None):
  168. """
  169. Parse a string representing a category, and returns a tuple with
  170. (possibly) the CCG variable for the category
  171. """
  172. (cat_string, rest) = nextCategory(line)
  173. if cat_string.startswith('('):
  174. (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
  175. else:
  176. # print rePrim.match(str).groups()
  177. (res, var) = parsePrimitiveCategory(
  178. PRIM_RE.match(cat_string).groups(), primitives, families, var
  179. )
  180. while rest != "":
  181. app = APP_RE.match(rest).groups()
  182. direction = parseApplication(app[0:3])
  183. rest = app[3]
  184. (cat_string, rest) = nextCategory(rest)
  185. if cat_string.startswith('('):
  186. (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
  187. else:
  188. (arg, var) = parsePrimitiveCategory(
  189. PRIM_RE.match(cat_string).groups(), primitives, families, var
  190. )
  191. res = FunctionalCategory(res, arg, direction)
  192. return (res, var)
  193. def fromstring(lex_str, include_semantics=False):
  194. """
  195. Convert string representation into a lexicon for CCGs.
  196. """
  197. CCGVar.reset_id()
  198. primitives = []
  199. families = {}
  200. entries = defaultdict(list)
  201. for line in lex_str.splitlines():
  202. # Strip comments and leading/trailing whitespace.
  203. line = COMMENTS_RE.match(line).groups()[0].strip()
  204. if line == "":
  205. continue
  206. if line.startswith(':-'):
  207. # A line of primitive categories.
  208. # The first one is the target category
  209. # ie, :- S, N, NP, VP
  210. primitives = primitives + [
  211. prim.strip() for prim in line[2:].strip().split(',')
  212. ]
  213. else:
  214. # Either a family definition, or a word definition
  215. (ident, sep, rhs) = LEX_RE.match(line).groups()
  216. (catstr, semantics_str) = RHS_RE.match(rhs).groups()
  217. (cat, var) = augParseCategory(catstr, primitives, families)
  218. if sep == '::':
  219. # Family definition
  220. # ie, Det :: NP/N
  221. families[ident] = (cat, var)
  222. else:
  223. semantics = None
  224. if include_semantics is True:
  225. if semantics_str is None:
  226. raise AssertionError(
  227. line
  228. + " must contain semantics because include_semantics is set to True"
  229. )
  230. else:
  231. semantics = Expression.fromstring(
  232. SEMANTICS_RE.match(semantics_str).groups()[0]
  233. )
  234. # Word definition
  235. # ie, which => (N\N)/(S/NP)
  236. entries[ident].append(Token(ident, cat, semantics))
  237. return CCGLexicon(primitives[0], primitives, families, entries)
  238. @deprecated('Use fromstring() instead.')
  239. def parseLexicon(lex_str):
  240. return fromstring(lex_str)
  241. openccg_tinytiny = fromstring(
  242. """
  243. # Rather minimal lexicon based on the openccg `tinytiny' grammar.
  244. # Only incorporates a subset of the morphological subcategories, however.
  245. :- S,NP,N # Primitive categories
  246. Det :: NP/N # Determiners
  247. Pro :: NP
  248. IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
  249. IntransVpl :: S\\NP[pl] # Plural
  250. TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
  251. TransVpl :: S\\NP[pl]/NP # Plural
  252. the => NP[sg]/N[sg]
  253. the => NP[pl]/N[pl]
  254. I => Pro
  255. me => Pro
  256. we => Pro
  257. us => Pro
  258. book => N[sg]
  259. books => N[pl]
  260. peach => N[sg]
  261. peaches => N[pl]
  262. policeman => N[sg]
  263. policemen => N[pl]
  264. boy => N[sg]
  265. boys => N[pl]
  266. sleep => IntransVsg
  267. sleep => IntransVpl
  268. eat => IntransVpl
  269. eat => TransVpl
  270. eats => IntransVsg
  271. eats => TransVsg
  272. see => TransVpl
  273. sees => TransVsg
  274. """
  275. )