api.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Twitter API
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. # Author: Ewan Klein <ewan@inf.ed.ac.uk>
  6. # Lorenzo Rubio <lrnzcig@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. This module provides an interface for TweetHandlers, and support for timezone
  11. handling.
  12. """
  13. import time as _time
  14. from abc import ABCMeta, abstractmethod
  15. from datetime import tzinfo, timedelta, datetime
  16. from six import add_metaclass
  17. from nltk.compat import UTC
  18. class LocalTimezoneOffsetWithUTC(tzinfo):
  19. """
  20. This is not intended to be a general purpose class for dealing with the
  21. local timezone. In particular:
  22. * it assumes that the date passed has been created using
  23. `datetime(..., tzinfo=Local)`, where `Local` is an instance of
  24. the object `LocalTimezoneOffsetWithUTC`;
  25. * for such an object, it returns the offset with UTC, used for date comparisons.
  26. Reference: https://docs.python.org/3/library/datetime.html
  27. """
  28. STDOFFSET = timedelta(seconds=-_time.timezone)
  29. if _time.daylight:
  30. DSTOFFSET = timedelta(seconds=-_time.altzone)
  31. else:
  32. DSTOFFSET = STDOFFSET
  33. def utcoffset(self, dt):
  34. """
  35. Access the relevant time offset.
  36. """
  37. return self.DSTOFFSET
  38. LOCAL = LocalTimezoneOffsetWithUTC()
  39. @add_metaclass(ABCMeta)
  40. class BasicTweetHandler(object):
  41. """
  42. Minimal implementation of `TweetHandler`.
  43. Counts the number of Tweets and decides when the client should stop
  44. fetching them.
  45. """
  46. def __init__(self, limit=20):
  47. self.limit = limit
  48. self.counter = 0
  49. """
  50. A flag to indicate to the client whether to stop fetching data given
  51. some condition (e.g., reaching a date limit).
  52. """
  53. self.do_stop = False
  54. """
  55. Stores the id of the last fetched Tweet to handle pagination.
  56. """
  57. self.max_id = None
  58. def do_continue(self):
  59. """
  60. Returns `False` if the client should stop fetching Tweets.
  61. """
  62. return self.counter < self.limit and not self.do_stop
  63. class TweetHandlerI(BasicTweetHandler):
  64. """
  65. Interface class whose subclasses should implement a handle method that
  66. Twitter clients can delegate to.
  67. """
  68. def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None):
  69. """
  70. :param int limit: The number of data items to process in the current\
  71. round of processing.
  72. :param tuple upper_date_limit: The date at which to stop collecting\
  73. new data. This should be entered as a tuple which can serve as the\
  74. argument to `datetime.datetime`.\
  75. E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015.
  76. :param tuple lower_date_limit: The date at which to stop collecting\
  77. new data. See `upper_data_limit` for formatting.
  78. """
  79. BasicTweetHandler.__init__(self, limit)
  80. self.upper_date_limit = None
  81. self.lower_date_limit = None
  82. if upper_date_limit:
  83. self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL)
  84. if lower_date_limit:
  85. self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL)
  86. self.startingup = True
  87. @abstractmethod
  88. def handle(self, data):
  89. """
  90. Deal appropriately with data returned by the Twitter API
  91. """
  92. @abstractmethod
  93. def on_finish(self):
  94. """
  95. Actions when the tweet limit has been reached
  96. """
  97. def check_date_limit(self, data, verbose=False):
  98. """
  99. Validate date limits.
  100. """
  101. if self.upper_date_limit or self.lower_date_limit:
  102. date_fmt = '%a %b %d %H:%M:%S +0000 %Y'
  103. tweet_date = datetime.strptime(data['created_at'], date_fmt).replace(
  104. tzinfo=UTC
  105. )
  106. if (self.upper_date_limit and tweet_date > self.upper_date_limit) or (
  107. self.lower_date_limit and tweet_date < self.lower_date_limit
  108. ):
  109. if self.upper_date_limit:
  110. message = "earlier"
  111. date_limit = self.upper_date_limit
  112. else:
  113. message = "later"
  114. date_limit = self.lower_date_limit
  115. if verbose:
  116. print(
  117. "Date limit {0} is {1} than date of current tweet {2}".format(
  118. date_limit, message, tweet_date
  119. )
  120. )
  121. self.do_stop = True