123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: Twitter client
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Ewan Klein <ewan@inf.ed.ac.uk>
- # Lorenzo Rubio <lrnzcig@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Utility functions for the :module:`twitterclient` module which do not require
- the `twython` library to have been installed.
- """
- from __future__ import print_function
- import csv
- import gzip
- import json
- from nltk import compat
- HIER_SEPARATOR = "."
def extract_fields(tweet, fields):
    """
    Extract field values from a full tweet and return them as a list

    :param json tweet: The tweet in JSON format
    :param list fields: The fields to be extracted from the tweet. A field
        may be a hierarchical name such as 'user.id'.
    :rtype: list
    :raises RuntimeError: if looking up a field fails with a `TypeError`
        (for instance when an intermediate value cannot be indexed). Other
        lookup errors are not translated and propagate unchanged.
    """
    out = []
    for field in fields:
        try:
            _add_field_to_out(tweet, field, out)
        except TypeError:
            # Build a single message string: passing the field as a second
            # positional argument made the exception render as a tuple.
            raise RuntimeError(
                'Fatal error when extracting fields. Cannot find field '
                '{0!r}'.format(field)
            )
    return out
def _add_field_to_out(json, field, out):
    """
    Look up `field` in `json` and append the value found to `out`.

    A hierarchical field name is resolved one level at a time: the part
    before the first separator selects the nested object, and the remainder
    is looked up inside that object.

    :param json: the object to read from (indexed with ``[]``)
    :param str field: plain or hierarchical field name
    :param list out: list that receives the value; modified in place
    """
    node = json
    remaining = field
    # Walk down one level per separator until a plain key is left.
    while _is_composed_key(remaining):
        head, remaining = _get_key_value_composed(remaining)
        node = node[head]
    out.append(node[remaining])
def _is_composed_key(field):
    """
    Tell whether `field` is a hierarchical name, i.e. whether it contains
    `HIER_SEPARATOR`.

    :rtype: bool
    """
    return HIER_SEPARATOR in field
def _get_key_value_composed(field):
    """
    Split a hierarchical field name at its first `HIER_SEPARATOR`.

    :param str field: the field name to split
    :return: a pair ``(key, value)`` where `key` is the first level and
        `value` is everything after the first separator (it may itself
        still contain separators, as names can have several levels)
    :rtype: tuple
    """
    first_level, _, remainder = field.partition(HIER_SEPARATOR)
    return first_level, remainder
def _get_entity_recursive(json, entity):
    """
    Search `json` for the value stored under the key `entity`.

    A dictionary is scanned key by key in iteration order: a key equal to
    `entity` yields its value immediately, while the wrapper keys
    'entities' and 'extended_entities' are searched recursively. A list is
    searched element by element. The first hit that is not None wins.

    :param json: a dictionary, a list, or any other value
    :param str entity: the key to look for
    :return: the value found, or None when `json` is empty, is neither a
        dictionary nor a list, or does not contain `entity`
    """
    if json and isinstance(json, dict):
        for name, payload in json.items():
            if name == entity:
                return payload
            # 'entities' and 'extended_entities' are wrappers in Twitter json
            # structure that contain other Twitter objects. See:
            # https://dev.twitter.com/overview/api/entities-in-twitter-objects
            if name in ('entities', 'extended_entities'):
                found = _get_entity_recursive(payload, entity)
                if found is not None:
                    return found
    elif json and isinstance(json, list):
        for element in json:
            found = _get_entity_recursive(element, entity)
            if found is not None:
                return found
    # Empty input, unsupported type, or nothing matched.
    return None
def json2csv(
    fp, outfile, fields, encoding='utf8', errors='replace', gzip_compress=False
):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full tweets to be easily converted
    to a CSV file for easier processing. For example, just TweetIDs or
    just the text content of the Tweets can be extracted.

    Additionally, the function allows combinations of fields of other Twitter
    objects (mainly the users, see below).

    For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see
    `json2csv_entities`

    :param fp: An iterable of lines (e.g. an open file), each line holding
        one full tweet as JSON
    :param str outfile: The name of the text file where results should be
        written
    :param list fields: The list of fields to be extracted. Useful examples
        are 'id_str' for the tweetID and 'text' for the text of the tweet. See
        <https://dev.twitter.com/overview/api/tweets> for a full list of
        fields, e. g.: ['id_str'], ['id', 'text', 'favorite_count',
        'retweet_count']. Additionally, it allows IDs from other Twitter
        objects, e. g.,
        ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count']
    :param str encoding: The encoding used for the output file
    :param errors: Behaviour for encoding errors, see
        https://docs.python.org/3/library/codecs.html#codec-base-classes
    :param gzip_compress: if `True`, output files are compressed with gzip
    """
    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
    try:
        # write the list of fields as header
        writer.writerow(fields)
        # process the file
        for line in fp:
            tweet = json.loads(line)
            row = extract_fields(tweet, fields)
            writer.writerow(row)
    finally:
        # Close the output even when a line cannot be parsed or a field
        # cannot be extracted, so the file handle is never leaked.
        outf.close()
def outf_writer_compat(outfile, encoding, errors, gzip_compress=False):
    """
    Identify appropriate CSV writer given the Python version

    :param str outfile: The path of the file to open for writing
    :param str encoding: The encoding used for the output
    :param errors: Behaviour for encoding errors, see
        https://docs.python.org/3/library/codecs.html#codec-base-classes
    :param gzip_compress: if `True`, the file is opened through `gzip`
    :return: a pair ``(writer, outf)``: the CSV writer and the open file
        object it writes to. The caller is responsible for closing `outf`.
    :rtype: tuple
    """
    if compat.PY3:
        # The csv module emits its own line terminators, so the text file
        # must be opened with newline=''; otherwise platforms that translate
        # '\n' on write (e.g. Windows) produce '\r\r\n' row endings.
        if gzip_compress:
            outf = gzip.open(
                outfile, 'wt', encoding=encoding, errors=errors, newline=''
            )
        else:
            outf = open(
                outfile, 'w', encoding=encoding, errors=errors, newline=''
            )
        writer = csv.writer(outf)
    else:
        # Python 2: write bytes and let the compat writer do the encoding.
        if gzip_compress:
            outf = gzip.open(outfile, 'wb')
        else:
            outf = open(outfile, 'wb')
        writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors)
    return (writer, outf)
def json2csv_entities(
    tweets_file,
    outfile,
    main_fields,
    entity_type,
    entity_fields,
    encoding='utf8',
    errors='replace',
    gzip_compress=False,
):
    """
    Extract selected fields from a file of line-separated JSON tweets and
    write to a file in CSV format.

    This utility function allows a file of full Tweets to be easily converted
    to a CSV file for easier processing of Twitter entities. For example, the
    hashtags or media elements of a tweet can be extracted.

    It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags
    there will be two lines in the output file, one per hashtag

    :param tweets_file: the file-like object containing full Tweets
    :param str outfile: The path of the text file where results should be
        written
    :param list main_fields: The list of fields to be extracted from the main
        object, usually the tweet. Useful examples: 'id_str' for the tweetID.
        See <https://dev.twitter.com/overview/api/tweets> for a full list of
        fields, e. g.: ['id_str'], ['id', 'text', 'favorite_count',
        'retweet_count'].
        If `entity_type` is expressed with hierarchy, then it is the list of
        fields of the object that corresponds to the key of the entity_type,
        (e.g., for entity_type='user.urls', the fields in the main_fields list
        belong to the user object; for entity_type='place.bounding_box', the
        fields in the main_fields list belong to the place object of the
        tweet).
    :param str entity_type: The name of the entity: 'hashtags', 'media',
        'urls' and 'user_mentions' for the tweet object. For a user object,
        this needs to be expressed with a hierarchy: `'user.urls'`. For the
        bounding box of the Tweet location, use `'place.bounding_box'`.
    :param list entity_fields: The list of fields to be extracted from the
        entity. E.g. `['text']` (of the Tweet)
    :param str encoding: The encoding used for the output file
    :param errors: Behaviour for encoding errors, see
        https://docs.python.org/3/library/codecs.html#codec-base-classes
    :param gzip_compress: if `True`, output files are compressed with gzip
    """
    (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
    try:
        header = get_header_field_list(main_fields, entity_type, entity_fields)
        writer.writerow(header)

        # The entity type does not change between tweets, so split it once.
        composed = _is_composed_key(entity_type)
        if composed:
            key, value = _get_key_value_composed(entity_type)

        for line in tweets_file:
            tweet = json.loads(line)
            if composed:
                object_json = _get_entity_recursive(tweet, key)
                if not object_json:
                    # this can happen in the case of "place"
                    continue
                object_fields = extract_fields(object_json, main_fields)
                items = _get_entity_recursive(object_json, value)
                _write_to_file(object_fields, items, entity_fields, writer)
            else:
                tweet_fields = extract_fields(tweet, main_fields)
                items = _get_entity_recursive(tweet, entity_type)
                _write_to_file(tweet_fields, items, entity_fields, writer)
    finally:
        # Close the output even when a tweet cannot be parsed or processed,
        # so the file handle is never leaked.
        outf.close()
def get_header_field_list(main_fields, entity_type, entity_fields):
    """
    Build the CSV header row for an entity export.

    Entity columns are always named ``<entity>.<field>``. When `entity_type`
    is hierarchical (e.g. 'user.urls'), the main columns are prefixed with
    its first level and the entity columns with the remainder; otherwise the
    main field names are used as they are.

    :param list main_fields: names of the fields of the main object
    :param str entity_type: plain or hierarchical name of the entity
    :param list entity_fields: names of the fields of the entity
    :return: main columns followed by entity columns
    :rtype: list
    """
    parent, child = None, entity_type
    if _is_composed_key(entity_type):
        parent, child = _get_key_value_composed(entity_type)

    # An absent (or empty) parent leaves the main field names untouched.
    main_columns = main_fields
    if parent:
        main_columns = [
            HIER_SEPARATOR.join([parent, name]) for name in main_fields
        ]

    entity_columns = [
        HIER_SEPARATOR.join([child, name]) for name in entity_fields
    ]
    return main_columns + entity_columns
def _write_to_file(object_fields, items, entity_fields, writer):
    """
    Write the CSV row(s) for the entities of one object.

    :param list object_fields: values already extracted from the main
        object; they start every row written. The list is not modified.
    :param items: the entity data. If falsy, nothing is written. If a
        dictionary, a single row is written. Otherwise it is iterated and
        one row is written per item.
    :param list entity_fields: names of the fields to take from the entity;
        hierarchical names are allowed
    :param writer: object with a `writerow` method (a CSV writer)
    :raises RuntimeError: in the dictionary case, if the first level of a
        hierarchical field name does not hold a dictionary
    """
    if not items:
        # it could be that the entity is just not present for the tweet
        # e.g. tweet hashtag is always present, even as [], however
        # tweet media may not be present
        return

    # Work on a copy so that the caller's list is never extended in place.
    base_row = list(object_fields)

    if isinstance(items, dict):
        # this happens e.g. for "place" of a tweet
        row = base_row
        # there might be composed keys in the list of required fields;
        # plain fields are written first, composed ones afterwards
        entity_field_values = [x for x in entity_fields if not _is_composed_key(x)]
        entity_field_composed = [x for x in entity_fields if _is_composed_key(x)]
        for field in entity_field_values:
            value = items[field]
            if isinstance(value, list):
                # a list value is spread over several columns
                row.extend(value)
            else:
                row.append(value)
        # now check required dictionaries
        for d in entity_field_composed:
            kd, vd = _get_key_value_composed(d)
            json_dict = items[kd]
            if not isinstance(json_dict, dict):
                raise RuntimeError(
                    'Key {0} does not contain a dictionary '
                    'in the json file'.format(kd)
                )
            row.append(json_dict[vd])
        writer.writerow(row)
        return

    # in general it is a list
    for item in items:
        writer.writerow(base_row + extract_fields(item, entity_fields))
|