toolbox.doctest 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. .. Copyright (C) 2001-2019 NLTK Project
  2. .. For license information, see LICENSE.TXT
  3. ===============================
  4. Unit test cases for ``toolbox``
  5. ===============================
  6. >>> from nltk import toolbox
  7. --------------------------
  8. ``toolbox.StandardFormat``
  9. --------------------------
  10. >>> f = toolbox.StandardFormat()
  11. ``toolbox.StandardFormat.open()``
  12. ---------------------------------
  13. >>> import os, tempfile
  14. >>> (fd, fname) = tempfile.mkstemp()
  15. >>> tf = os.fdopen(fd, "w")
  16. >>> _ = tf.write('\\lx a value\n\\lx another value\n')
  17. >>> tf.close()
  18. >>> f = toolbox.StandardFormat()
  19. >>> f.open(fname)
  20. >>> list(f.fields())
  21. [('lx', 'a value'), ('lx', 'another value')]
  22. >>> f.close()
  23. >>> os.unlink(fname)
  24. ``toolbox.StandardFormat.open_string()``
  25. ----------------------------------------
  26. >>> f = toolbox.StandardFormat()
  27. >>> f.open_string('\\lx a value\n\\lx another value\n')
  28. >>> list(f.fields())
  29. [('lx', 'a value'), ('lx', 'another value')]
  30. >>> f.close()
  31. ``toolbox.StandardFormat.close()``
  32. ----------------------------------
  33. >>> f = toolbox.StandardFormat()
  34. >>> f.open_string('\\lx a value\n\\lx another value\n')
  35. >>> list(f.fields())
  36. [('lx', 'a value'), ('lx', 'another value')]
  37. >>> f.close()
  38. ``toolbox.StandardFormat.line_num``
  39. ---------------------------------------
  40. ``StandardFormat.line_num`` contains the line number of the last line returned:
  41. >>> f = toolbox.StandardFormat()
  42. >>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n')
  43. >>> line_nums = []
  44. >>> for l in f.raw_fields():
  45. ... line_nums.append(f.line_num)
  46. >>> line_nums
  47. [1, 2, 3]
  48. For fields that span several lines, ``StandardFormat.line_num`` contains the number of the last line of the field just returned (trailing blank lines included):
  49. >>> f = toolbox.StandardFormat()
  50. >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
  51. >>> line_nums = []
  52. >>> for l in f.raw_fields():
  53. ... line_nums.append(f.line_num)
  54. >>> line_nums
  55. [2, 5, 7]
  56. ``StandardFormat.line_num`` doesn't exist before opening or after closing
  57. a file or string:
  58. >>> f = toolbox.StandardFormat()
  59. >>> f.line_num
  60. Traceback (most recent call last):
  61. ...
  62. AttributeError: 'StandardFormat' object has no attribute 'line_num'
  63. >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
  64. >>> line_nums = []
  65. >>> for l in f.raw_fields():
  66. ... line_nums.append(f.line_num)
  67. >>> line_nums
  68. [2, 5, 7]
  69. >>> f.close()
  70. >>> f.line_num
  71. Traceback (most recent call last):
  72. ...
  73. AttributeError: 'StandardFormat' object has no attribute 'line_num'
  74. ``toolbox.StandardFormat.raw_fields()``
  75. ---------------------------------------
  76. ``raw_fields()`` returns an iterator over tuples of two strings representing the
  77. marker and its value. The marker is given without the backslash and the value
  78. without its trailing newline:
  79. >>> f = toolbox.StandardFormat()
  80. >>> f.open_string('\\lx a value\n\\lx another value\n')
  81. >>> list(f.raw_fields())
  82. [('lx', 'a value'), ('lx', 'another value')]
  83. an empty file returns nothing:
  84. >>> f = toolbox.StandardFormat()
  85. >>> f.open_string('')
  86. >>> list(f.raw_fields())
  87. []
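  88. file with only a newline currently returns a single field with no marker and an empty value (whether this is the desired behaviour is still an open question):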
  89. >>> f = toolbox.StandardFormat()
  90. >>> f.open_string('\n')
  91. >>> list(f.raw_fields())
  92. [(None, '')]
  93. file with only one field should be parsed ok:
  94. >>> f = toolbox.StandardFormat()
  95. >>> f.open_string('\\lx one value\n')
  96. >>> list(f.raw_fields())
  97. [('lx', 'one value')]
  98. file without a trailing newline should be parsed ok:
  99. >>> f = toolbox.StandardFormat()
  100. >>> f.open_string('\\lx a value\n\\lx another value')
  101. >>> list(f.raw_fields())
  102. [('lx', 'a value'), ('lx', 'another value')]
  103. trailing white space is preserved except for the final newline:
  104. >>> f = toolbox.StandardFormat()
  105. >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
  106. >>> list(f.raw_fields())
  107. [('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')]
  108. line wrapping is preserved:
  109. >>> f = toolbox.StandardFormat()
  110. >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
  111. >>> list(f.raw_fields())
  112. [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
  113. file beginning with a multiline record should be parsed ok:
  114. >>> f = toolbox.StandardFormat()
  115. >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
  116. >>> list(f.raw_fields())
  117. [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
  118. file ending with a multiline record should be parsed ok:
  119. >>> f = toolbox.StandardFormat()
  120. >>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n')
  121. >>> list(f.raw_fields())
  122. [('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')]
  123. file beginning with a BOM should be parsed ok:
  124. >>> f = toolbox.StandardFormat()
  125. >>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n')
  126. >>> list(f.raw_fields())
  127. [('lx', 'a value'), ('lx', 'another value')]
  128. file beginning with two BOMs should ignore only the first one:
  129. >>> f = toolbox.StandardFormat()
  130. >>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n')
  131. >>> list(f.raw_fields())
  132. [(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')]
  133. should not ignore a BOM not at the beginning of the file:
  134. >>> f = toolbox.StandardFormat()
  135. >>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n')
  136. >>> list(f.raw_fields())
  137. [('lx', 'a value\n\xef\xbb\xbf\\lx another value')]
  138. ``toolbox.StandardFormat.fields()``
  139. -----------------------------------
  140. trailing white space is not preserved:
  141. >>> f = toolbox.StandardFormat()
  142. >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
  143. >>> list(f.fields())
  144. [('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')]
  145. multiline fields are unwrapped:
  146. >>> f = toolbox.StandardFormat()
  147. >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
  148. >>> list(f.fields())
  149. [('lx', 'a value more of the value and still more'), ('lc', 'another val')]
  150. markers
  151. -------
  152. A backslash in the first position on a new line indicates the start of a
  153. marker. The backslash is not part of the marker:
  154. >>> f = toolbox.StandardFormat()
  155. >>> f.open_string('\\mk a value\n')
  156. >>> list(f.fields())
  157. [('mk', 'a value')]
  158. If the backslash occurs later in the line it does not indicate the start
  159. of a marker:
  160. >>> f = toolbox.StandardFormat()
  161. >>> f.open_string('\\mk a value\n \\mk another one\n')
  162. >>> list(f.raw_fields())
  163. [('mk', 'a value\n \\mk another one')]
  164. There is no specific limit to the length of a marker:
  165. >>> f = toolbox.StandardFormat()
  166. >>> f.open_string('\\this_is_an_extremely_long_marker value\n')
  167. >>> list(f.fields())
  168. [('this_is_an_extremely_long_marker', 'value')]
  169. A marker can contain any non-whitespace character:
  170. >>> f = toolbox.StandardFormat()
  171. >>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\|,<.>/?;:"0123456789 value\n')
  172. >>> list(f.fields())
  173. [('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')]
  174. A marker is terminated by any white space character:
  175. >>> f = toolbox.StandardFormat()
  176. >>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one')
  177. >>> list(f.fields())
  178. [('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')]
  179. Consecutive whitespace characters (except newline) are treated the same as one:
  180. >>> f = toolbox.StandardFormat()
  181. >>> f.open_string('\\mk \t\r\fa value\n')
  182. >>> list(f.fields())
  183. [('mk', 'a value')]
  184. -----------------------
  185. ``toolbox.ToolboxData``
  186. -----------------------
  187. >>> db = toolbox.ToolboxData()
  188. ``toolbox.ToolboxData.parse()``
  189. -------------------------------
  190. check that normal parsing works:
  191. >>> from xml.etree import ElementTree
  192. >>> td = toolbox.ToolboxData()
  193. >>> s = """\\_sh v3.0 400 Rotokas Dictionary
  194. ... \\_DateStampHasFourDigitYear
  195. ...
  196. ... \\lx kaa
  197. ... \\ps V.A
  198. ... \\ge gag
  199. ... \\gp nek i pas
  200. ...
  201. ... \\lx kaa
  202. ... \\ps V.B
  203. ... \\ge strangle
  204. ... \\gp pasim nek
  205. ... """
  206. >>> td.open_string(s)
  207. >>> tree = td.parse(key='lx')
  208. >>> tree.tag
  209. 'toolbox_data'
  210. >>> ElementTree.tostring(list(tree)[0]).decode('utf8')
  211. '<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
  212. >>> ElementTree.tostring(list(tree)[1]).decode('utf8')
  213. '<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
  214. >>> ElementTree.tostring(list(tree)[2]).decode('utf8')
  215. '<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
  216. check that guessing the key marker works:
  217. >>> from xml.etree import ElementTree
  218. >>> td = toolbox.ToolboxData()
  219. >>> s = """\\_sh v3.0 400 Rotokas Dictionary
  220. ... \\_DateStampHasFourDigitYear
  221. ...
  222. ... \\lx kaa
  223. ... \\ps V.A
  224. ... \\ge gag
  225. ... \\gp nek i pas
  226. ...
  227. ... \\lx kaa
  228. ... \\ps V.B
  229. ... \\ge strangle
  230. ... \\gp pasim nek
  231. ... """
  232. >>> td.open_string(s)
  233. >>> tree = td.parse()
  234. >>> ElementTree.tostring(list(tree)[0]).decode('utf8')
  235. '<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
  236. >>> ElementTree.tostring(list(tree)[1]).decode('utf8')
  237. '<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
  238. >>> ElementTree.tostring(list(tree)[2]).decode('utf8')
  239. '<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
  240. -----------------------
  241. ``toolbox`` functions
  242. -----------------------
  243. ``toolbox.to_sfm_string()``
  244. -------------------------------