  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Twitter client
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. # Author: Ewan Klein <ewan@inf.ed.ac.uk>
  6. # Lorenzo Rubio <lrnzcig@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. Utility functions for the :module:`twitterclient` module which do not require
  11. the `twython` library to have been installed.
  12. """
  13. from __future__ import print_function
  14. import csv
  15. import gzip
  16. import json
  17. from nltk import compat
  18. HIER_SEPARATOR = "."
  19. def extract_fields(tweet, fields):
  20. """
  21. Extract field values from a full tweet and return them as a list
  22. :param json tweet: The tweet in JSON format
  23. :param list fields: The fields to be extracted from the tweet
  24. :rtype: list(str)
  25. """
  26. out = []
  27. for field in fields:
  28. try:
  29. _add_field_to_out(tweet, field, out)
  30. except TypeError:
  31. raise RuntimeError(
  32. 'Fatal error when extracting fields. Cannot find field ', field
  33. )
  34. return out
  35. def _add_field_to_out(json, field, out):
  36. if _is_composed_key(field):
  37. key, value = _get_key_value_composed(field)
  38. _add_field_to_out(json[key], value, out)
  39. else:
  40. out += [json[field]]
  41. def _is_composed_key(field):
  42. if HIER_SEPARATOR in field:
  43. return True
  44. return False
  45. def _get_key_value_composed(field):
  46. out = field.split(HIER_SEPARATOR)
  47. # there could be up to 3 levels
  48. key = out[0]
  49. value = HIER_SEPARATOR.join(out[1:])
  50. return key, value
  51. def _get_entity_recursive(json, entity):
  52. if not json:
  53. return None
  54. elif isinstance(json, dict):
  55. for key, value in json.items():
  56. if key == entity:
  57. return value
  58. # 'entities' and 'extended_entities' are wrappers in Twitter json
  59. # structure that contain other Twitter objects. See:
  60. # https://dev.twitter.com/overview/api/entities-in-twitter-objects
  61. if key == 'entities' or key == 'extended_entities':
  62. candidate = _get_entity_recursive(value, entity)
  63. if candidate is not None:
  64. return candidate
  65. return None
  66. elif isinstance(json, list):
  67. for item in json:
  68. candidate = _get_entity_recursive(item, entity)
  69. if candidate is not None:
  70. return candidate
  71. return None
  72. else:
  73. return None
  74. def json2csv(
  75. fp, outfile, fields, encoding='utf8', errors='replace', gzip_compress=False
  76. ):
  77. """
  78. Extract selected fields from a file of line-separated JSON tweets and
  79. write to a file in CSV format.
  80. This utility function allows a file of full tweets to be easily converted
  81. to a CSV file for easier processing. For example, just TweetIDs or
  82. just the text content of the Tweets can be extracted.
  83. Additionally, the function allows combinations of fields of other Twitter
  84. objects (mainly the users, see below).
  85. For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see
  86. `json2csv_entities`
  87. :param str infile: The name of the file containing full tweets
  88. :param str outfile: The name of the text file where results should be\
  89. written
  90. :param list fields: The list of fields to be extracted. Useful examples\
  91. are 'id_str' for the tweetID and 'text' for the text of the tweet. See\
  92. <https://dev.twitter.com/overview/api/tweets> for a full list of fields.\
  93. e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\
  94. Additionally, it allows IDs from other Twitter objects, e. g.,\
  95. ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
  96. :param error: Behaviour for encoding errors, see\
  97. https://docs.python.org/3/library/codecs.html#codec-base-classes
  98. :param gzip_compress: if `True`, output files are compressed with gzip
  99. """
  100. (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
  101. # write the list of fields as header
  102. writer.writerow(fields)
  103. # process the file
  104. for line in fp:
  105. tweet = json.loads(line)
  106. row = extract_fields(tweet, fields)
  107. writer.writerow(row)
  108. outf.close()
  109. def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
  110. """
  111. Identify appropriate CSV writer given the Python version
  112. """
  113. if compat.PY3:
  114. if gzip_compress:
  115. outf = gzip.open(outfile, 'wt', encoding=encoding, errors=errors)
  116. else:
  117. outf = open(outfile, 'w', encoding=encoding, errors=errors)
  118. writer = csv.writer(outf)
  119. else:
  120. if gzip_compress:
  121. outf = gzip.open(outfile, 'wb')
  122. else:
  123. outf = open(outfile, 'wb')
  124. writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
  125. return (writer, outf)
  126. def json2csv_entities(
  127. tweets_file,
  128. outfile,
  129. main_fields,
  130. entity_type,
  131. entity_fields,
  132. encoding='utf8',
  133. errors='replace',
  134. gzip_compress=False,
  135. ):
  136. """
  137. Extract selected fields from a file of line-separated JSON tweets and
  138. write to a file in CSV format.
  139. This utility function allows a file of full Tweets to be easily converted
  140. to a CSV file for easier processing of Twitter entities. For example, the
  141. hashtags or media elements of a tweet can be extracted.
  142. It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
  143. there will be two lines in the output file, one per hashtag
  144. :param tweets_file: the file-like object containing full Tweets
  145. :param str outfile: The path of the text file where results should be\
  146. written
  147. :param list main_fields: The list of fields to be extracted from the main\
  148. object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\
  149. <https://dev.twitter.com/overview/api/tweets> for a full list of fields.
  150. e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']
  151. If `entity_type` is expressed with hierarchy, then it is the list of\
  152. fields of the object that corresponds to the key of the entity_type,\
  153. (e.g., for entity_type='user.urls', the fields in the main_fields list\
  154. belong to the user object; for entity_type='place.bounding_box', the\
  155. files in the main_field list belong to the place object of the tweet).
  156. :param list entity_type: The name of the entity: 'hashtags', 'media',\
  157. 'urls' and 'user_mentions' for the tweet object. For a user object,\
  158. this needs to be expressed with a hierarchy: `'user.urls'`. For the\
  159. bounding box of the Tweet location, use `'place.bounding_box'`.
  160. :param list entity_fields: The list of fields to be extracted from the\
  161. entity. E.g. `['text']` (of the Tweet)
  162. :param error: Behaviour for encoding errors, see\
  163. https://docs.python.org/3/library/codecs.html#codec-base-classes
  164. :param gzip_compress: if `True`, ouput files are compressed with gzip
  165. """
  166. (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
  167. header = get_header_field_list(main_fields, entity_type, entity_fields)
  168. writer.writerow(header)
  169. for line in tweets_file:
  170. tweet = json.loads(line)
  171. if _is_composed_key(entity_type):
  172. key, value = _get_key_value_composed(entity_type)
  173. object_json = _get_entity_recursive(tweet, key)
  174. if not object_json:
  175. # this can happen in the case of "place"
  176. continue
  177. object_fields = extract_fields(object_json, main_fields)
  178. items = _get_entity_recursive(object_json, value)
  179. _write_to_file(object_fields, items, entity_fields, writer)
  180. else:
  181. tweet_fields = extract_fields(tweet, main_fields)
  182. items = _get_entity_recursive(tweet, entity_type)
  183. _write_to_file(tweet_fields, items, entity_fields, writer)
  184. outf.close()
  185. def get_header_field_list(main_fields, entity_type, entity_fields):
  186. if _is_composed_key(entity_type):
  187. key, value = _get_key_value_composed(entity_type)
  188. main_entity = key
  189. sub_entity = value
  190. else:
  191. main_entity = None
  192. sub_entity = entity_type
  193. if main_entity:
  194. output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields]
  195. else:
  196. output1 = main_fields
  197. output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields]
  198. return output1 + output2
  199. def _write_to_file(object_fields, items, entity_fields, writer):
  200. if not items:
  201. # it could be that the entity is just not present for the tweet
  202. # e.g. tweet hashtag is always present, even as [], however
  203. # tweet media may not be present
  204. return
  205. if isinstance(items, dict):
  206. # this happens e.g. for "place" of a tweet
  207. row = object_fields
  208. # there might be composed keys in de list of required fields
  209. entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
  210. entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
  211. for field in entity_field_values:
  212. value = items[field]
  213. if isinstance(value, list):
  214. row += value
  215. else:
  216. row += [value]
  217. # now check required dictionaries
  218. for d in entity_field_composed:
  219. kd, vd = _get_key_value_composed(d)
  220. json_dict = items[kd]
  221. if not isinstance(json_dict, dict):
  222. raise RuntimeError(
  223. """Key {0} does not contain a dictionary
  224. in the json file""".format(
  225. kd
  226. )
  227. )
  228. row += [json_dict[vd]]
  229. writer.writerow(row)
  230. return
  231. # in general it is a list
  232. for item in items:
  233. row = object_fields + extract_fields(item, entity_fields)
  234. writer.writerow(row)