123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- #
- # Natural Language Toolkit: TGrep search
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Will Roberts <wildwilhelm@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- '''
- ============================================
- TGrep search implementation for NLTK trees
- ============================================
- This module supports TGrep2 syntax for matching parts of NLTK Trees.
- Note that many tgrep operators require the tree passed to be a
- ``ParentedTree``.
- External links:
- - `Tgrep tutorial <http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html>`_
- - `Tgrep2 manual <http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf>`_
- - `Tgrep2 source <http://tedlab.mit.edu/~dr/Tgrep2/>`_
- Usage
- =====
- >>> from nltk.tree import ParentedTree
- >>> from nltk.tgrep import tgrep_nodes, tgrep_positions
- >>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
- >>> list(tgrep_nodes('NN', [tree]))
- [[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]]
- >>> list(tgrep_positions('NN', [tree]))
- [[(0, 2), (2, 1)]]
- >>> list(tgrep_nodes('DT', [tree]))
- [[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]]
- >>> list(tgrep_nodes('DT $ JJ', [tree]))
- [[ParentedTree('DT', ['the'])]]
- This implementation adds syntax to select nodes based on their NLTK
- tree position. This syntax is ``N`` plus a Python tuple representing
- the tree position. For instance, ``N()``, ``N(0,)``, ``N(0,0)`` are
- valid node selectors. Example:
- >>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')
- >>> tree[0,0]
- ParentedTree('DT', ['the'])
- >>> tree[0,0].treeposition()
- (0, 0)
- >>> list(tgrep_nodes('N(0,0)', [tree]))
- [[ParentedTree('DT', ['the'])]]
- Caveats:
- ========
- - Link modifiers: "?" and "=" are not implemented.
- - Tgrep compatibility: Using "@" for "!", "{" for "<", "}" for ">" are
- not implemented.
- - The "=" and "~" links are not implemented.
- Known Issues:
- =============
- - There are some issues with link relations involving leaf nodes
- (which are represented as bare strings in NLTK trees). For
- instance, consider the tree::
- (S (A x))
- The search string ``* !>> S`` should select all nodes which are not
- dominated in some way by an ``S`` node (i.e., all nodes which are
- not descendants of an ``S``). Clearly, in this tree, the only node
- which fulfills this criterion is the top node (since it is not
- dominated by anything). However, the code here will find both the
- top node and the leaf node ``x``. This is because we cannot recover
- the parent of the leaf, since it is stored as a bare string.
- A possible workaround, when performing this kind of search, would be
- to filter out all leaf nodes.
- Implementation notes
- ====================
- This implementation is (somewhat awkwardly) based on lambda functions
- which are predicates on a node. A predicate is a function which is
- either True or False; using a predicate function, we can identify sets
- of nodes with particular properties. A predicate function, could, for
- instance, return True only if a particular node has a label matching a
- particular regular expression, and has a daughter node which has no
- sisters. Because tgrep2 search strings can do things statefully (such
- as substituting in macros, and binding nodes with node labels), the
- actual predicate function is declared with three arguments::
- pred = lambda n, m, l: return True # some logic here
- ``n``
- is a node in a tree; this argument must always be given
- ``m``
- contains a dictionary, mapping macro names onto predicate functions
- ``l``
- is a dictionary to map node labels onto nodes in the tree
- ``m`` and ``l`` are declared to default to ``None``, and so need not be
- specified in a call to a predicate. Predicates which call other
- predicates must always pass the value of these arguments on. The
- top-level predicate (constructed by ``_tgrep_exprs_action``) binds the
- macro definitions to ``m`` and initialises ``l`` to an empty dictionary.
- '''
- from __future__ import absolute_import, print_function, unicode_literals
- import functools
- import re
- from six import binary_type, text_type
- try:
- import pyparsing
- except ImportError:
- print('Warning: nltk.tgrep will not work without the `pyparsing` package')
- print('installed.')
- import nltk.tree
- class TgrepException(Exception):
- '''Tgrep exception type.'''
- pass
- def ancestors(node):
- '''
- Returns the list of all nodes dominating the given tree node.
- This method will not work with leaf nodes, since there is no way
- to recover the parent.
- '''
- results = []
- try:
- current = node.parent()
- except AttributeError:
- # if node is a leaf, we cannot retrieve its parent
- return results
- while current:
- results.append(current)
- current = current.parent()
- return results
- def unique_ancestors(node):
- '''
- Returns the list of all nodes dominating the given node, where
- there is only a single path of descent.
- '''
- results = []
- try:
- current = node.parent()
- except AttributeError:
- # if node is a leaf, we cannot retrieve its parent
- return results
- while current and len(current) == 1:
- results.append(current)
- current = current.parent()
- return results
- def _descendants(node):
- '''
- Returns the list of all nodes which are descended from the given
- tree node in some way.
- '''
- try:
- treepos = node.treepositions()
- except AttributeError:
- return []
- return [node[x] for x in treepos[1:]]
- def _leftmost_descendants(node):
- '''
- Returns the set of all nodes descended in some way through
- left branches from this node.
- '''
- try:
- treepos = node.treepositions()
- except AttributeError:
- return []
- return [node[x] for x in treepos[1:] if all(y == 0 for y in x)]
- def _rightmost_descendants(node):
- '''
- Returns the set of all nodes descended in some way through
- right branches from this node.
- '''
- try:
- rightmost_leaf = max(node.treepositions())
- except AttributeError:
- return []
- return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)]
- def _istree(obj):
- '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
- return isinstance(obj, nltk.tree.Tree)
- def _unique_descendants(node):
- '''
- Returns the list of all nodes descended from the given node, where
- there is only a single path of descent.
- '''
- results = []
- current = node
- while current and _istree(current) and len(current) == 1:
- current = current[0]
- results.append(current)
- return results
- def _before(node):
- '''
- Returns the set of all nodes that are before the given node.
- '''
- try:
- pos = node.treeposition()
- tree = node.root()
- except AttributeError:
- return []
- return [tree[x] for x in tree.treepositions() if x[: len(pos)] < pos[: len(x)]]
- def _immediately_before(node):
- '''
- Returns the set of all nodes that are immediately before the given
- node.
- Tree node A immediately precedes node B if the last terminal
- symbol (word) produced by A immediately precedes the first
- terminal symbol produced by B.
- '''
- try:
- pos = node.treeposition()
- tree = node.root()
- except AttributeError:
- return []
- # go "upwards" from pos until there is a place we can go to the left
- idx = len(pos) - 1
- while 0 <= idx and pos[idx] == 0:
- idx -= 1
- if idx < 0:
- return []
- pos = list(pos[: idx + 1])
- pos[-1] -= 1
- before = tree[pos]
- return [before] + _rightmost_descendants(before)
- def _after(node):
- '''
- Returns the set of all nodes that are after the given node.
- '''
- try:
- pos = node.treeposition()
- tree = node.root()
- except AttributeError:
- return []
- return [tree[x] for x in tree.treepositions() if x[: len(pos)] > pos[: len(x)]]
- def _immediately_after(node):
- '''
- Returns the set of all nodes that are immediately after the given
- node.
- Tree node A immediately follows node B if the first terminal
- symbol (word) produced by A immediately follows the last
- terminal symbol produced by B.
- '''
- try:
- pos = node.treeposition()
- tree = node.root()
- current = node.parent()
- except AttributeError:
- return []
- # go "upwards" from pos until there is a place we can go to the
- # right
- idx = len(pos) - 1
- while 0 <= idx and pos[idx] == len(current) - 1:
- idx -= 1
- current = current.parent()
- if idx < 0:
- return []
- pos = list(pos[: idx + 1])
- pos[-1] += 1
- after = tree[pos]
- return [after] + _leftmost_descendants(after)
- def _tgrep_node_literal_value(node):
- '''
- Gets the string value of a given parse tree node, for comparison
- using the tgrep node literal predicates.
- '''
- return node.label() if _istree(node) else text_type(node)
- def _tgrep_macro_use_action(_s, _l, tokens):
- '''
- Builds a lambda function which looks up the macro name used.
- '''
- assert len(tokens) == 1
- assert tokens[0][0] == '@'
- macro_name = tokens[0][1:]
- def macro_use(n, m=None, l=None):
- if m is None or macro_name not in m:
- raise TgrepException('macro {0} not defined'.format(macro_name))
- return m[macro_name](n, m, l)
- return macro_use
- def _tgrep_node_action(_s, _l, tokens):
- '''
- Builds a lambda function representing a predicate on a tree node
- depending on the name of its node.
- '''
- # print 'node tokens: ', tokens
- if tokens[0] == "'":
- # strip initial apostrophe (tgrep2 print command)
- tokens = tokens[1:]
- if len(tokens) > 1:
- # disjunctive definition of a node name
- assert list(set(tokens[1::2])) == ['|']
- # recursively call self to interpret each node name definition
- tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]]
- # capture tokens and return the disjunction
- return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens)
- else:
- if hasattr(tokens[0], '__call__'):
- # this is a previously interpreted parenthetical node
- # definition (lambda function)
- return tokens[0]
- elif tokens[0] == '*' or tokens[0] == '__':
- return lambda n, m=None, l=None: True
- elif tokens[0].startswith('"'):
- assert tokens[0].endswith('"')
- node_lit = tokens[0][1:-1].replace('\\"', '"').replace('\\\\', '\\')
- return (
- lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s
- )(node_lit)
- elif tokens[0].startswith('/'):
- assert tokens[0].endswith('/')
- node_lit = tokens[0][1:-1]
- return (
- lambda r: lambda n, m=None, l=None: r.search(
- _tgrep_node_literal_value(n)
- )
- )(re.compile(node_lit))
- elif tokens[0].startswith('i@'):
- node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()])
- return (
- lambda f: lambda n, m=None, l=None: f(
- _tgrep_node_literal_value(n).lower()
- )
- )(node_func)
- else:
- return (
- lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s
- )(tokens[0])
- def _tgrep_parens_action(_s, _l, tokens):
- '''
- Builds a lambda function representing a predicate on a tree node
- from a parenthetical notation.
- '''
- # print 'parenthetical tokens: ', tokens
- assert len(tokens) == 3
- assert tokens[0] == '('
- assert tokens[2] == ')'
- return tokens[1]
- def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
- '''
- Builds a lambda function representing a predicate on a tree node
- which returns true if the node is located at a specific tree
- position.
- '''
- # recover the tuple from the parsed sting
- node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
- # capture the node's tree position
- return (
- lambda i: lambda n, m=None, l=None: (
- hasattr(n, 'treeposition') and n.treeposition() == i
- )
- )(node_tree_position)
- def _tgrep_relation_action(_s, _l, tokens):
- '''
- Builds a lambda function representing a predicate on a tree node
- depending on its relation to other nodes in the tree.
- '''
- # print 'relation tokens: ', tokens
- # process negation first if needed
- negated = False
- if tokens[0] == '!':
- negated = True
- tokens = tokens[1:]
- if tokens[0] == '[':
- # process square-bracketed relation expressions
- assert len(tokens) == 3
- assert tokens[2] == ']'
- retval = tokens[1]
- else:
- # process operator-node relation expressions
- assert len(tokens) == 2
- operator, predicate = tokens
- # A < B A is the parent of (immediately dominates) B.
- if operator == '<':
- retval = lambda n, m=None, l=None: (
- _istree(n) and any(predicate(x, m, l) for x in n)
- )
- # A > B A is the child of B.
- elif operator == '>':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and bool(n.parent())
- and predicate(n.parent(), m, l)
- )
- # A <, B Synonymous with A <1 B.
- elif operator == '<,' or operator == '<1':
- retval = lambda n, m=None, l=None: (
- _istree(n) and bool(list(n)) and predicate(n[0], m, l)
- )
- # A >, B Synonymous with A >1 B.
- elif operator == '>,' or operator == '>1':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and bool(n.parent())
- and (n is n.parent()[0])
- and predicate(n.parent(), m, l)
- )
- # A <N B B is the Nth child of A (the first child is <1).
- elif operator[0] == '<' and operator[1:].isdigit():
- idx = int(operator[1:])
- # capture the index parameter
- retval = (
- lambda i: lambda n, m=None, l=None: (
- _istree(n)
- and bool(list(n))
- and 0 <= i < len(n)
- and predicate(n[i], m, l)
- )
- )(idx - 1)
- # A >N B A is the Nth child of B (the first child is >1).
- elif operator[0] == '>' and operator[1:].isdigit():
- idx = int(operator[1:])
- # capture the index parameter
- retval = (
- lambda i: lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and bool(n.parent())
- and 0 <= i < len(n.parent())
- and (n is n.parent()[i])
- and predicate(n.parent(), m, l)
- )
- )(idx - 1)
- # A <' B B is the last child of A (also synonymous with A <-1 B).
- # A <- B B is the last child of A (synonymous with A <-1 B).
- elif operator == '<\'' or operator == '<-' or operator == '<-1':
- retval = lambda n, m=None, l=None: (
- _istree(n) and bool(list(n)) and predicate(n[-1], m, l)
- )
- # A >' B A is the last child of B (also synonymous with A >-1 B).
- # A >- B A is the last child of B (synonymous with A >-1 B).
- elif operator == '>\'' or operator == '>-' or operator == '>-1':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and bool(n.parent())
- and (n is n.parent()[-1])
- and predicate(n.parent(), m, l)
- )
- # A <-N B B is the N th-to-last child of A (the last child is <-1).
- elif operator[:2] == '<-' and operator[2:].isdigit():
- idx = -int(operator[2:])
- # capture the index parameter
- retval = (
- lambda i: lambda n, m=None, l=None: (
- _istree(n)
- and bool(list(n))
- and 0 <= (i + len(n)) < len(n)
- and predicate(n[i + len(n)], m, l)
- )
- )(idx)
- # A >-N B A is the N th-to-last child of B (the last child is >-1).
- elif operator[:2] == '>-' and operator[2:].isdigit():
- idx = -int(operator[2:])
- # capture the index parameter
- retval = (
- lambda i: lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and bool(n.parent())
- and 0 <= (i + len(n.parent())) < len(n.parent())
- and (n is n.parent()[i + len(n.parent())])
- and predicate(n.parent(), m, l)
- )
- )(idx)
- # A <: B B is the only child of A
- elif operator == '<:':
- retval = lambda n, m=None, l=None: (
- _istree(n) and len(n) == 1 and predicate(n[0], m, l)
- )
- # A >: B A is the only child of B.
- elif operator == '>:':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and bool(n.parent())
- and len(n.parent()) == 1
- and predicate(n.parent(), m, l)
- )
- # A << B A dominates B (A is an ancestor of B).
- elif operator == '<<':
- retval = lambda n, m=None, l=None: (
- _istree(n) and any(predicate(x, m, l) for x in _descendants(n))
- )
- # A >> B A is dominated by B (A is a descendant of B).
- elif operator == '>>':
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in ancestors(n)
- )
- # A <<, B B is a left-most descendant of A.
- elif operator == '<<,' or operator == '<<1':
- retval = lambda n, m=None, l=None: (
- _istree(n) and any(predicate(x, m, l) for x in _leftmost_descendants(n))
- )
- # A >>, B A is a left-most descendant of B.
- elif operator == '>>,':
- retval = lambda n, m=None, l=None: any(
- (predicate(x, m, l) and n in _leftmost_descendants(x))
- for x in ancestors(n)
- )
- # A <<' B B is a right-most descendant of A.
- elif operator == '<<\'':
- retval = lambda n, m=None, l=None: (
- _istree(n)
- and any(predicate(x, m, l) for x in _rightmost_descendants(n))
- )
- # A >>' B A is a right-most descendant of B.
- elif operator == '>>\'':
- retval = lambda n, m=None, l=None: any(
- (predicate(x, m, l) and n in _rightmost_descendants(x))
- for x in ancestors(n)
- )
- # A <<: B There is a single path of descent from A and B is on it.
- elif operator == '<<:':
- retval = lambda n, m=None, l=None: (
- _istree(n) and any(predicate(x, m, l) for x in _unique_descendants(n))
- )
- # A >>: B There is a single path of descent from B and A is on it.
- elif operator == '>>:':
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in unique_ancestors(n)
- )
- # A . B A immediately precedes B.
- elif operator == '.':
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in _immediately_after(n)
- )
- # A , B A immediately follows B.
- elif operator == ',':
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in _immediately_before(n)
- )
- # A .. B A precedes B.
- elif operator == '..':
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in _after(n)
- )
- # A ,, B A follows B.
- elif operator == ',,':
- retval = lambda n, m=None, l=None: any(
- predicate(x, m, l) for x in _before(n)
- )
- # A $ B A is a sister of B (and A != B).
- elif operator == '$' or operator == '%':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and bool(n.parent())
- and any(predicate(x, m, l) for x in n.parent() if x is not n)
- )
- # A $. B A is a sister of and immediately precedes B.
- elif operator == '$.' or operator == '%.':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'right_sibling')
- and bool(n.right_sibling())
- and predicate(n.right_sibling(), m, l)
- )
- # A $, B A is a sister of and immediately follows B.
- elif operator == '$,' or operator == '%,':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'left_sibling')
- and bool(n.left_sibling())
- and predicate(n.left_sibling(), m, l)
- )
- # A $.. B A is a sister of and precedes B.
- elif operator == '$..' or operator == '%..':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and hasattr(n, 'parent_index')
- and bool(n.parent())
- and any(predicate(x, m, l) for x in n.parent()[n.parent_index() + 1 :])
- )
- # A $,, B A is a sister of and follows B.
- elif operator == '$,,' or operator == '%,,':
- retval = lambda n, m=None, l=None: (
- hasattr(n, 'parent')
- and hasattr(n, 'parent_index')
- and bool(n.parent())
- and any(predicate(x, m, l) for x in n.parent()[: n.parent_index()])
- )
- else:
- raise TgrepException(
- 'cannot interpret tgrep operator "{0}"'.format(operator)
- )
- # now return the built function
- if negated:
- return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval)
- else:
- return retval
- def _tgrep_conjunction_action(_s, _l, tokens, join_char='&'):
- '''
- Builds a lambda function representing a predicate on a tree node
- from the conjunction of several other such lambda functions.
- This is prototypically called for expressions like
- (`tgrep_rel_conjunction`)::
- < NP & < AP < VP
- where tokens is a list of predicates representing the relations
- (`< NP`, `< AP`, and `< VP`), possibly with the character `&`
- included (as in the example here).
- This is also called for expressions like (`tgrep_node_expr2`)::
- NP < NN
- S=s < /NP/=n : s < /VP/=v : n .. v
- tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional)
- list of segmented patterns (`tgrep_expr_labeled`, processed by
- `_tgrep_segmented_pattern_action`).
- '''
- # filter out the ampersand
- tokens = [x for x in tokens if x != join_char]
- # print 'relation conjunction tokens: ', tokens
- if len(tokens) == 1:
- return tokens[0]
- else:
- return (
- lambda ts: lambda n, m=None, l=None: all(
- predicate(n, m, l) for predicate in ts
- )
- )(tokens)
- def _tgrep_segmented_pattern_action(_s, _l, tokens):
- '''
- Builds a lambda function representing a segmented pattern.
- Called for expressions like (`tgrep_expr_labeled`)::
- =s .. =v < =n
- This is a segmented pattern, a tgrep2 expression which begins with
- a node label.
- The problem is that for segemented_pattern_action (': =v < =s'),
- the first element (in this case, =v) is specifically selected by
- virtue of matching a particular node in the tree; to retrieve
- the node, we need the label, not a lambda function. For node
- labels inside a tgrep_node_expr, we need a lambda function which
- returns true if the node visited is the same as =v.
- We solve this by creating two copies of a node_label_use in the
- grammar; the label use inside a tgrep_expr_labeled has a separate
- parse action to the pred use inside a node_expr. See
- `_tgrep_node_label_use_action` and
- `_tgrep_node_label_pred_use_action`.
- '''
- # tokens[0] is a string containing the node label
- node_label = tokens[0]
- # tokens[1:] is an (optional) list of predicates which must all
- # hold of the bound node
- reln_preds = tokens[1:]
- def pattern_segment_pred(n, m=None, l=None):
- '''This predicate function ignores its node argument.'''
- # look up the bound node using its label
- if l is None or node_label not in l:
- raise TgrepException(
- 'node_label ={0} not bound in pattern'.format(node_label)
- )
- node = l[node_label]
- # match the relation predicates against the node
- return all(pred(node, m, l) for pred in reln_preds)
- return pattern_segment_pred
- def _tgrep_node_label_use_action(_s, _l, tokens):
- '''
- Returns the node label used to begin a tgrep_expr_labeled. See
- `_tgrep_segmented_pattern_action`.
- Called for expressions like (`tgrep_node_label_use`)::
- =s
- when they appear as the first element of a `tgrep_expr_labeled`
- expression (see `_tgrep_segmented_pattern_action`).
- It returns the node label.
- '''
- assert len(tokens) == 1
- assert tokens[0].startswith('=')
- return tokens[0][1:]
- def _tgrep_node_label_pred_use_action(_s, _l, tokens):
- '''
- Builds a lambda function representing a predicate on a tree node
- which describes the use of a previously bound node label.
- Called for expressions like (`tgrep_node_label_use_pred`)::
- =s
- when they appear inside a tgrep_node_expr (for example, inside a
- relation). The predicate returns true if and only if its node
- argument is identical the the node looked up in the node label
- dictionary using the node's label.
- '''
- assert len(tokens) == 1
- assert tokens[0].startswith('=')
- node_label = tokens[0][1:]
- def node_label_use_pred(n, m=None, l=None):
- # look up the bound node using its label
- if l is None or node_label not in l:
- raise TgrepException(
- 'node_label ={0} not bound in pattern'.format(node_label)
- )
- node = l[node_label]
- # truth means the given node is this node
- return n is node
- return node_label_use_pred
- def _tgrep_bind_node_label_action(_s, _l, tokens):
- '''
- Builds a lambda function representing a predicate on a tree node
- which can optionally bind a matching node into the tgrep2 string's
- label_dict.
- Called for expressions like (`tgrep_node_expr2`)::
- /NP/
- @NP=n
- '''
- # tokens[0] is a tgrep_node_expr
- if len(tokens) == 1:
- return tokens[0]
- else:
- # if present, tokens[1] is the character '=', and tokens[2] is
- # a tgrep_node_label, a string value containing the node label
- assert len(tokens) == 3
- assert tokens[1] == '='
- node_pred = tokens[0]
- node_label = tokens[2]
- def node_label_bind_pred(n, m=None, l=None):
- if node_pred(n, m, l):
- # bind `n` into the dictionary `l`
- if l is None:
- raise TgrepException(
- 'cannot bind node_label {0}: label_dict is None'.format(
- node_label
- )
- )
- l[node_label] = n
- return True
- else:
- return False
- return node_label_bind_pred
- def _tgrep_rel_disjunction_action(_s, _l, tokens):
- '''
- Builds a lambda function representing a predicate on a tree node
- from the disjunction of several other such lambda functions.
- '''
- # filter out the pipe
- tokens = [x for x in tokens if x != '|']
- # print 'relation disjunction tokens: ', tokens
- if len(tokens) == 1:
- return tokens[0]
- elif len(tokens) == 2:
- return (lambda a, b: lambda n, m=None, l=None: a(n, m, l) or b(n, m, l))(
- tokens[0], tokens[1]
- )
- def _macro_defn_action(_s, _l, tokens):
- '''
- Builds a dictionary structure which defines the given macro.
- '''
- assert len(tokens) == 3
- assert tokens[0] == '@'
- return {tokens[1]: tokens[2]}
- def _tgrep_exprs_action(_s, _l, tokens):
- '''
- This is the top-lebel node in a tgrep2 search string; the
- predicate function it returns binds together all the state of a
- tgrep2 search string.
- Builds a lambda function representing a predicate on a tree node
- from the disjunction of several tgrep expressions. Also handles
- macro definitions and macro name binding, and node label
- definitions and node label binding.
- '''
- if len(tokens) == 1:
- return lambda n, m=None, l=None: tokens[0](n, None, {})
- # filter out all the semicolons
- tokens = [x for x in tokens if x != ';']
- # collect all macro definitions
- macro_dict = {}
- macro_defs = [tok for tok in tokens if isinstance(tok, dict)]
- for macro_def in macro_defs:
- macro_dict.update(macro_def)
- # collect all tgrep expressions
- tgrep_exprs = [tok for tok in tokens if not isinstance(tok, dict)]
- # create a new scope for the node label dictionary
- def top_level_pred(n, m=macro_dict, l=None):
- label_dict = {}
- # bind macro definitions and OR together all tgrep_exprs
- return any(predicate(n, m, label_dict) for predicate in tgrep_exprs)
- return top_level_pred
- def _build_tgrep_parser(set_parse_actions=True):
- '''
- Builds a pyparsing-based parser object for tokenizing and
- interpreting tgrep search strings.
- '''
- tgrep_op = pyparsing.Optional('!') + pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*')
- tgrep_qstring = pyparsing.QuotedString(
- quoteChar='"', escChar='\\', unquoteResults=False
- )
- tgrep_node_regex = pyparsing.QuotedString(
- quoteChar='/', escChar='\\', unquoteResults=False
- )
- tgrep_qstring_icase = pyparsing.Regex('i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"')
- tgrep_node_regex_icase = pyparsing.Regex('i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/')
- tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
- tgrep_expr = pyparsing.Forward()
- tgrep_relations = pyparsing.Forward()
- tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
- tgrep_nltk_tree_pos = (
- pyparsing.Literal('N(')
- + pyparsing.Optional(
- pyparsing.Word(pyparsing.nums)
- + ','
- + pyparsing.Optional(
- pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=',')
- + pyparsing.Optional(',')
- )
- )
- + ')'
- )
- tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+')
- tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label)
- # see _tgrep_segmented_pattern_action
- tgrep_node_label_use_pred = tgrep_node_label_use.copy()
- macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+')
- macro_name.setWhitespaceChars('')
- macro_use = pyparsing.Combine('@' + macro_name)
- tgrep_node_expr = (
- tgrep_node_label_use_pred
- | macro_use
- | tgrep_nltk_tree_pos
- | tgrep_qstring_icase
- | tgrep_node_regex_icase
- | tgrep_qstring
- | tgrep_node_regex
- | '*'
- | tgrep_node_literal
- )
- tgrep_node_expr2 = (
- tgrep_node_expr
- + pyparsing.Literal('=').setWhitespaceChars('')
- + tgrep_node_label.copy().setWhitespaceChars('')
- ) | tgrep_node_expr
- tgrep_node = tgrep_parens | (
- pyparsing.Optional("'")
- + tgrep_node_expr2
- + pyparsing.ZeroOrMore("|" + tgrep_node_expr)
- )
- tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
- tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node)
- tgrep_rel_conjunction = pyparsing.Forward()
- tgrep_rel_conjunction << (
- tgrep_relation
- + pyparsing.ZeroOrMore(pyparsing.Optional('&') + tgrep_rel_conjunction)
- )
- tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
- "|" + tgrep_relations
- )
- tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
- tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations)
- tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled)
- macro_defn = (
- pyparsing.Literal('@') + pyparsing.White().suppress() + macro_name + tgrep_expr2
- )
- tgrep_exprs = (
- pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(';' + macro_defn) + ';')
- + tgrep_expr2
- + pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2))
- + pyparsing.ZeroOrMore(';').suppress()
- )
- if set_parse_actions:
- tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action)
- tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action)
- macro_use.setParseAction(_tgrep_macro_use_action)
- tgrep_node.setParseAction(_tgrep_node_action)
- tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action)
- tgrep_parens.setParseAction(_tgrep_parens_action)
- tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
- tgrep_relation.setParseAction(_tgrep_relation_action)
- tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action)
- tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
- macro_defn.setParseAction(_macro_defn_action)
- # the whole expression is also the conjunction of two
- # predicates: the first node predicate, and the remaining
- # relation predicates
- tgrep_expr.setParseAction(_tgrep_conjunction_action)
- tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action)
- tgrep_expr2.setParseAction(
- functools.partial(_tgrep_conjunction_action, join_char=':')
- )
- tgrep_exprs.setParseAction(_tgrep_exprs_action)
- return tgrep_exprs.ignore('#' + pyparsing.restOfLine)
- def tgrep_tokenize(tgrep_string):
- '''
- Tokenizes a TGrep search string into separate tokens.
- '''
- parser = _build_tgrep_parser(False)
- if isinstance(tgrep_string, binary_type):
- tgrep_string = tgrep_string.decode()
- return list(parser.parseString(tgrep_string))
- def tgrep_compile(tgrep_string):
- '''
- Parses (and tokenizes, if necessary) a TGrep search string into a
- lambda function.
- '''
- parser = _build_tgrep_parser(True)
- if isinstance(tgrep_string, binary_type):
- tgrep_string = tgrep_string.decode()
- return list(parser.parseString(tgrep_string, parseAll=True))[0]
- def treepositions_no_leaves(tree):
- '''
- Returns all the tree positions in the given tree which are not
- leaf nodes.
- '''
- treepositions = tree.treepositions()
- # leaves are treeposition tuples that are not prefixes of any
- # other treeposition
- prefixes = set()
- for pos in treepositions:
- for length in range(len(pos)):
- prefixes.add(pos[:length])
- return [pos for pos in treepositions if pos in prefixes]
- def tgrep_positions(pattern, trees, search_leaves=True):
- """
- Return the tree positions in the trees which match the given pattern.
- :param pattern: a tgrep search pattern
- :type pattern: str or output of tgrep_compile()
- :param trees: a sequence of NLTK trees (usually ParentedTrees)
- :type trees: iter(ParentedTree) or iter(Tree)
- :param search_leaves: whether ot return matching leaf nodes
- :type search_leaves: bool
- :rtype: iter(tree positions)
- """
- if isinstance(pattern, (binary_type, text_type)):
- pattern = tgrep_compile(pattern)
- for tree in trees:
- try:
- if search_leaves:
- positions = tree.treepositions()
- else:
- positions = treepositions_no_leaves(tree)
- yield [position for position in positions if pattern(tree[position])]
- except AttributeError:
- yield []
- def tgrep_nodes(pattern, trees, search_leaves=True):
- """
- Return the tree nodes in the trees which match the given pattern.
- :param pattern: a tgrep search pattern
- :type pattern: str or output of tgrep_compile()
- :param trees: a sequence of NLTK trees (usually ParentedTrees)
- :type trees: iter(ParentedTree) or iter(Tree)
- :param search_leaves: whether ot return matching leaf nodes
- :type search_leaves: bool
- :rtype: iter(tree nodes)
- """
- if isinstance(pattern, (binary_type, text_type)):
- pattern = tgrep_compile(pattern)
- for tree in trees:
- try:
- if search_leaves:
- positions = tree.treepositions()
- else:
- positions = treepositions_no_leaves(tree)
- yield [tree[position] for position in positions if pattern(tree[position])]
- except AttributeError:
- yield []
|