# coding: utf-8
# Natural Language Toolkit: vader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
#         George Berry <geb97@cornell.edu> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
"""
If you use the VADER sentiment analysis tools, please cite:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""

import math
import re
import string
from itertools import product

import nltk.data
from .util import pairwise

##Constants##

# (empirically derived mean sentiment intensity rating increase for booster words)
B_INCR = 0.293
B_DECR = -0.293

# (empirically derived mean sentiment intensity rating increase for using
# ALLCAPs to emphasize a word)
C_INCR = 0.733

N_SCALAR = -0.74

# for removing punctuation
REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))

PUNC_LIST = [
    ".",
    "!",
    "?",
    ",",
    ";",
    ":",
    "-",
    "'",
    "\"",
    "!!",
    "!!!",
    "??",
    "???",
    "?!?",
    "!?!",
    "?!?!",
    "!?!?",
]

NEGATE = {
    "aint",
    "arent",
    "cannot",
    "cant",
    "couldnt",
    "darent",
    "didnt",
    "doesnt",
    "ain't",
    "aren't",
    "can't",
    "couldn't",
    "daren't",
    "didn't",
    "doesn't",
    "dont",
    "hadnt",
    "hasnt",
    "havent",
    "isnt",
    "mightnt",
    "mustnt",
    "neither",
    "don't",
    "hadn't",
    "hasn't",
    "haven't",
    "isn't",
    "mightn't",
    "mustn't",
    "neednt",
    "needn't",
    "never",
    "none",
    "nope",
    "nor",
    "not",
    "nothing",
    "nowhere",
    "oughtnt",
    "shant",
    "shouldnt",
    "uhuh",
    "wasnt",
    "werent",
    "oughtn't",
    "shan't",
    "shouldn't",
    "uh-uh",
    "wasn't",
    "weren't",
    "without",
    "wont",
    "wouldnt",
    "won't",
    "wouldn't",
    "rarely",
    "seldom",
    "despite",
}

# booster/dampener 'intensifiers' or 'degree adverbs'
# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
BOOSTER_DICT = {
    "absolutely": B_INCR,
    "amazingly": B_INCR,
    "awfully": B_INCR,
    "completely": B_INCR,
    "considerably": B_INCR,
    "decidedly": B_INCR,
    "deeply": B_INCR,
    "effing": B_INCR,
    "enormously": B_INCR,
    "entirely": B_INCR,
    "especially": B_INCR,
    "exceptionally": B_INCR,
    "extremely": B_INCR,
    "fabulously": B_INCR,
    "flipping": B_INCR,
    "flippin": B_INCR,
    "fricking": B_INCR,
    "frickin": B_INCR,
    "frigging": B_INCR,
    "friggin": B_INCR,
    "fully": B_INCR,
    "fucking": B_INCR,
    "greatly": B_INCR,
    "hella": B_INCR,
    "highly": B_INCR,
    "hugely": B_INCR,
    "incredibly": B_INCR,
    "intensely": B_INCR,
    "majorly": B_INCR,
    "more": B_INCR,
    "most": B_INCR,
    "particularly": B_INCR,
    "purely": B_INCR,
    "quite": B_INCR,
    "really": B_INCR,
    "remarkably": B_INCR,
    "so": B_INCR,
    "substantially": B_INCR,
    "thoroughly": B_INCR,
    "totally": B_INCR,
    "tremendously": B_INCR,
    "uber": B_INCR,
    "unbelievably": B_INCR,
    "unusually": B_INCR,
    "utterly": B_INCR,
    "very": B_INCR,
    "almost": B_DECR,
    "barely": B_DECR,
    "hardly": B_DECR,
    "just enough": B_DECR,
    "kind of": B_DECR,
    "kinda": B_DECR,
    "kindof": B_DECR,
    "kind-of": B_DECR,
    "less": B_DECR,
    "little": B_DECR,
    "marginally": B_DECR,
    "occasionally": B_DECR,
    "partly": B_DECR,
    "scarcely": B_DECR,
    "slightly": B_DECR,
    "somewhat": B_DECR,
    "sort of": B_DECR,
    "sorta": B_DECR,
    "sortof": B_DECR,
    "sort-of": B_DECR,
}

# check for special case idioms using a sentiment-laden keyword known to SAGE
SPECIAL_CASE_IDIOMS = {
    "the shit": 3,
    "the bomb": 3,
    "bad ass": 1.5,
    "yeah right": -2,
    "cut the mustard": 2,
    "kiss of death": -1.5,
    "hand to mouth": -2,
}

##Static methods##

def negated(input_words, include_nt=True):
    """
    Determine if input contains negation words
    """
    neg_words = NEGATE
    if any(word.lower() in neg_words for word in input_words):
        return True
    if include_nt:
        if any("n't" in word.lower() for word in input_words):
            return True
    for first, second in pairwise(input_words):
        if second.lower() == "least" and first.lower() != 'at':
            return True
    return False
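
# Illustrative check (not in the original source): negated(["not", "good"])
# returns True because "not" is in NEGATE; negated(["good"]) returns False,
# since there is no negation token, no "n't" contraction, and no bare "least".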

def normalize(score, alpha=15):
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value
    """
    norm_score = score / math.sqrt((score * score) + alpha)
    return norm_score
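
# Worked example (illustrative): normalize(4) = 4 / sqrt(4 * 4 + 15)
# = 4 / sqrt(31), approximately 0.7184, so raw sums of word valences are
# squashed into the open interval (-1, 1).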

def allcap_differential(words):
    """
    Check whether just some words in the input are ALL CAPS

    :param list words: The words to inspect
    :returns: `True` if some but not all items in `words` are ALL CAPS
    """
    is_different = False
    allcap_words = 0
    for word in words:
        if word.isupper():
            allcap_words += 1
    cap_differential = len(words) - allcap_words
    if 0 < cap_differential < len(words):
        is_different = True
    return is_different
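
# Illustrative: allcap_differential(["GREAT", "day"]) is True (mixed casing
# signals deliberate emphasis), while allcap_differential(["GREAT", "DAY"])
# is False because every word is capitalized.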

def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Check if the preceding words increase, decrease, or negate/nullify the
    valence
    """
    scalar = 0.0
    word_lower = word.lower()
    if word_lower in BOOSTER_DICT:
        scalar = BOOSTER_DICT[word_lower]
        if valence < 0:
            scalar *= -1
        # check if booster/dampener word is in ALLCAPS (while others aren't)
        if word.isupper() and is_cap_diff:
            if valence > 0:
                scalar += C_INCR
            else:
                scalar -= C_INCR
    return scalar
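
# Illustrative: scalar_inc_dec("VERY", 2.0, True) returns
# B_INCR + C_INCR = 0.293 + 0.733 = 1.026, since "very" is a booster and its
# ALL-CAPS spelling adds the capitalization emphasis on a positive valence.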

class SentiText(object):
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text):
        if not isinstance(text, str):
            text = str(text.encode('utf-8'))
        self.text = text
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from adjacent punctuation
        # (keeps emoticons & contractions)
        self.is_cap_diff = allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = set(w for w in words_only if len(w) > 1)
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
        punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation
        Leaves contractions and most emoticons
        Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes
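
    # Illustrative (not in the original source): with self.text set to
    # "It was GREAT!!", _words_and_emoticons() yields ['It', 'was', 'GREAT'];
    # "GREAT!!" is mapped back to "GREAT" via _words_plus_punc(), while an
    # emoticon such as ":)" has no dictionary entry and survives unchanged.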

class SentimentIntensityAnalyzer(object):
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt"
    ):
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_file.split('\n'):
            if not line:
                # skip blank lines (e.g. a trailing newline in the data file)
                continue
            (word, measure) = line.strip().split('\t')[0:2]
            lex_dict[word] = float(measure)
        return lex_dict
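
    # Note (illustrative; the field layout is an assumption about the shipped
    # data file): each lexicon line is tab-separated and only the first two
    # fields are used, so a hypothetical line "word\t2.5\t..." yields
    # lex_dict["word"] == 2.5.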

    def polarity_scores(self, text):
        """
        Return a float for sentiment strength based on the input text.
        Positive values are positive valence, negative values are negative
        valence.
        """
        sentitext = SentiText(text)

        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        # use enumerate so repeated tokens get the right index
        # (list.index(item) would always return the first occurrence)
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in BOOSTER_DICT:
                sentiments.append(valence)
                continue
            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)
        return self.score_valence(sentiments, text)

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += C_INCR
                else:
                    valence -= C_INCR

            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2, "under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * N_SCALAR
        return valence

    def _but_check(self, words_and_emoticons, sentiments):
        # check for modification in sentiment due to contrastive conjunction 'but'
        if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
            try:
                bi = words_and_emoticons.index('but')
            except ValueError:
                bi = words_and_emoticons.index('BUT')
            # use enumerate so duplicate scores are rescaled at the right
            # position (list.index(sentiment) finds only the first match)
            for si, sentiment in enumerate(sentiments):
                if si < bi:
                    sentiments[si] = sentiment * 0.5
                elif si > bi:
                    sentiments[si] = sentiment * 1.5
        return sentiments

    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = "{0} {1}".format(words_and_emoticons[i - 1], words_and_emoticons[i])

        twoonezero = "{0} {1} {2}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = "{0} {1}".format(
            words_and_emoticons[i - 2], words_and_emoticons[i - 1]
        )

        threetwoone = "{0} {1} {2}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{0} {1}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = "{0} {1}".format(
                words_and_emoticons[i], words_and_emoticons[i + 1]
            )
            if zeroone in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{0} {1} {2}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
            valence = valence + B_DECR
        return valence
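
    # Illustrative: when i points at "death" in "... kiss of death ...",
    # twoonezero == "kiss of death", so the valence is overridden with
    # SPECIAL_CASE_IDIOMS["kiss of death"] == -1.5.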

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if negated([words_and_emoticons[i - 1]]):
                valence = valence * N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier
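
    # Illustrative: "Great!!!" contributes 3 * 0.292 = 0.876 of added
    # emphasis, and anything past four exclamation points is capped at
    # 4 * 0.292 = 1.168.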

    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier
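
    # Illustrative: "??" adds 2 * 0.18 = 0.36 and "???" adds 0.54, while
    # four or more question marks are capped at 0.96.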

    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (
                    float(sentiment_score) + 1
                )  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (
                    float(sentiment_score) - 1
                )  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count
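
    # Illustrative: _sift_sentiment_scores([1.9, -1.3, 0]) returns
    # (2.9, -2.3, 1); each positive score is counted as score + 1, each
    # negative one as score - 1, and exact zeros are tallied as neutral.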

    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)
        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict