# coding: utf-8
# Natural Language Toolkit: vader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
#         George Berry <geb97@cornell.edu> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
"""
If you use the VADER sentiment analysis tools, please cite:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""

import math
import re
import string
from itertools import product

import nltk.data
from .util import pairwise

##Constants##

# (empirically derived mean sentiment intensity rating increase for booster words)
B_INCR = 0.293
B_DECR = -0.293

# (empirically derived mean sentiment intensity rating increase for using
# ALLCAPs to emphasize a word)
C_INCR = 0.733

N_SCALAR = -0.74

# for removing punctuation
REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))

PUNC_LIST = [
    ".",
    "!",
    "?",
    ",",
    ";",
    ":",
    "-",
    "'",
    "\"",
    "!!",
    "!!!",
    "??",
    "???",
    "?!?",
    "!?!",
    "?!?!",
    "!?!?",
]

NEGATE = {
    "aint",
    "arent",
    "cannot",
    "cant",
    "couldnt",
    "darent",
    "didnt",
    "doesnt",
    "ain't",
    "aren't",
    "can't",
    "couldn't",
    "daren't",
    "didn't",
    "doesn't",
    "dont",
    "hadnt",
    "hasnt",
    "havent",
    "isnt",
    "mightnt",
    "mustnt",
    "neither",
    "don't",
    "hadn't",
    "hasn't",
    "haven't",
    "isn't",
    "mightn't",
    "mustn't",
    "neednt",
    "needn't",
    "never",
    "none",
    "nope",
    "nor",
    "not",
    "nothing",
    "nowhere",
    "oughtnt",
    "shant",
    "shouldnt",
    "uhuh",
    "wasnt",
    "werent",
    "oughtn't",
    "shan't",
    "shouldn't",
    "uh-uh",
    "wasn't",
    "weren't",
    "without",
    "wont",
    "wouldnt",
    "won't",
    "wouldn't",
    "rarely",
    "seldom",
    "despite",
}

# booster/dampener 'intensifiers' or 'degree adverbs'
# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
BOOSTER_DICT = {
    "absolutely": B_INCR,
    "amazingly": B_INCR,
    "awfully": B_INCR,
    "completely": B_INCR,
    "considerably": B_INCR,
    "decidedly": B_INCR,
    "deeply": B_INCR,
    "effing": B_INCR,
    "enormously": B_INCR,
    "entirely": B_INCR,
    "especially": B_INCR,
    "exceptionally": B_INCR,
    "extremely": B_INCR,
    "fabulously": B_INCR,
    "flipping": B_INCR,
    "flippin": B_INCR,
    "fricking": B_INCR,
    "frickin": B_INCR,
    "frigging": B_INCR,
    "friggin": B_INCR,
    "fully": B_INCR,
    "fucking": B_INCR,
    "greatly": B_INCR,
    "hella": B_INCR,
    "highly": B_INCR,
    "hugely": B_INCR,
    "incredibly": B_INCR,
    "intensely": B_INCR,
    "majorly": B_INCR,
    "more": B_INCR,
    "most": B_INCR,
    "particularly": B_INCR,
    "purely": B_INCR,
    "quite": B_INCR,
    "really": B_INCR,
    "remarkably": B_INCR,
    "so": B_INCR,
    "substantially": B_INCR,
    "thoroughly": B_INCR,
    "totally": B_INCR,
    "tremendously": B_INCR,
    "uber": B_INCR,
    "unbelievably": B_INCR,
    "unusually": B_INCR,
    "utterly": B_INCR,
    "very": B_INCR,
    "almost": B_DECR,
    "barely": B_DECR,
    "hardly": B_DECR,
    "just enough": B_DECR,
    "kind of": B_DECR,
    "kinda": B_DECR,
    "kindof": B_DECR,
    "kind-of": B_DECR,
    "less": B_DECR,
    "little": B_DECR,
    "marginally": B_DECR,
    "occasionally": B_DECR,
    "partly": B_DECR,
    "scarcely": B_DECR,
    "slightly": B_DECR,
    "somewhat": B_DECR,
    "sort of": B_DECR,
    "sorta": B_DECR,
    "sortof": B_DECR,
    "sort-of": B_DECR,
}

# check for special case idioms using a sentiment-laden keyword known to SAGE
SPECIAL_CASE_IDIOMS = {
    "the shit": 3,
    "the bomb": 3,
    "bad ass": 1.5,
    "yeah right": -2,
    "cut the mustard": 2,
    "kiss of death": -1.5,
    "hand to mouth": -2,
}

##Static methods##


def negated(input_words, include_nt=True):
    """
    Determine if input contains negation words
    """
    neg_words = NEGATE
    if any(word.lower() in neg_words for word in input_words):
        return True
    if include_nt:
        if any("n't" in word.lower() for word in input_words):
            return True
    for first, second in pairwise(input_words):
        if second.lower() == "least" and first.lower() != 'at':
            return True
    return False
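
# Illustrative behaviour of negated() (the token lists below are hypothetical
# inputs, not fixtures from the original test suite):
#
#     negated(["not", "good"])    -> True   ("not" is in NEGATE)
#     negated(["isnt", "fun"])    -> True   (contraction without apostrophe)
#     negated(["can't", "stop"])  -> True   (include_nt matches "n't")
#     negated(["at", "least"])    -> False  ("least" is preceded by "at")
#     negated(["the", "least"])   -> True   ("least" not preceded by "at")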

def normalize(score, alpha=15):
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value
    """
    norm_score = score / math.sqrt((score * score) + alpha)
    return norm_score
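
# Worked example: with the default alpha of 15, a raw score of 4 maps to
# 4 / math.sqrt(4 * 4 + 15) = 4 / math.sqrt(31), approximately 0.7184; the
# output approaches (but never reaches) +/-1 as the raw score grows.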

def allcap_differential(words):
    """
    Check whether just some words in the input are ALL CAPS

    :param list words: The words to inspect
    :returns: `True` if some but not all items in `words` are ALL CAPS
    """
    is_different = False
    allcap_words = 0
    for word in words:
        if word.isupper():
            allcap_words += 1
    cap_differential = len(words) - allcap_words
    if 0 < cap_differential < len(words):
        is_different = True
    return is_different
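
# Illustrative behaviour of allcap_differential() (hypothetical inputs):
#
#     allcap_differential(["GREAT", "movie"])  -> True   (some, not all, caps)
#     allcap_differential(["GREAT", "MOVIE"])  -> False  (all caps)
#     allcap_differential(["great", "movie"])  -> False  (no caps)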

def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Check if the preceding words increase, decrease, or negate/nullify the
    valence
    """
    scalar = 0.0
    word_lower = word.lower()
    if word_lower in BOOSTER_DICT:
        scalar = BOOSTER_DICT[word_lower]
        if valence < 0:
            scalar *= -1
        # check if booster/dampener word is in ALLCAPS (while others aren't)
        if word.isupper() and is_cap_diff:
            if valence > 0:
                scalar += C_INCR
            else:
                scalar -= C_INCR
    return scalar
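
# Worked examples for scalar_inc_dec() (hypothetical inputs):
#
#     scalar_inc_dec("very", 2.0, False)    -> 0.293   (booster, positive valence)
#     scalar_inc_dec("very", -2.0, False)   -> -0.293  (sign flipped for negative)
#     scalar_inc_dec("VERY", 2.0, True)     -> 1.026   (0.293 + C_INCR for ALLCAPS)
#     scalar_inc_dec("barely", 2.0, False)  -> -0.293  (dampener)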

class SentiText(object):
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text):
        if not isinstance(text, str):
            # decode bytes rather than calling str() on them, which would
            # embed a b'...' literal in the text under Python 3
            if isinstance(text, bytes):
                text = text.decode('utf-8')
            else:
                text = str(text)
        self.text = text
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from adjacent punctuation
        # (keeps emoticons & contractions)
        self.is_cap_diff = allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = set(w for w in words_only if len(w) > 1)
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
        punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation.
        Leaves contractions and most emoticons.
        Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes
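
    # Illustrative tokenization (hypothetical input): for
    # SentiText("The book was GOOD!!"), text.split() yields
    # ['The', 'book', 'was', 'GOOD!!'] and the punctuation map strips the
    # trailing '!!', so words_and_emoticons becomes
    # ['The', 'book', 'was', 'GOOD'].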

class SentimentIntensityAnalyzer(object):
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt"
    ):
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_file.split('\n'):
            # skip blank lines (e.g. a trailing newline at end of file),
            # which would otherwise raise a ValueError on unpacking
            if not line.strip():
                continue
            (word, measure) = line.strip().split('\t')[0:2]
            lex_dict[word] = float(measure)
        return lex_dict
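
    # Each lexicon line is expected to be tab-separated, and only the first
    # two fields are used here. A line of the (hypothetical) form
    #     "amazing\t2.8\t0.6178\t[3, 3, 3, 3, 3, 2, 3, 2, 2, 4]"
    # would yield lex_dict["amazing"] == 2.8.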

    def polarity_scores(self, text):
        """
        Return a float for sentiment strength based on the input text.
        Positive values are positive valence, negative values are negative
        valence.
        """
        sentitext = SentiText(text)

        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        # use enumerate rather than list.index() so that a repeated word is
        # matched to its own position, not the first occurrence
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in BOOSTER_DICT:
                sentiments.append(valence)
                continue
            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)
        return self.score_valence(sentiments, text)
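
    # Example round trip (illustrative; exact numbers depend on the lexicon
    # version shipped with the NLTK data package):
    #
    #     sia = SentimentIntensityAnalyzer()
    #     sia.polarity_scores("VADER is smart, handsome, and funny.")
    #     # -> {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}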

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += C_INCR
                else:
                    valence -= C_INCR

            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2, "under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)
        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * N_SCALAR
        return valence

    def _but_check(self, words_and_emoticons, sentiments):
        # check for modification in sentiment due to contrastive conjunction 'but'
        if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
            try:
                bi = words_and_emoticons.index('but')
            except ValueError:
                bi = words_and_emoticons.index('BUT')
            # use enumerate rather than list.index() so that duplicate
            # sentiment values are scaled at the correct positions
            for si, sentiment in enumerate(sentiments):
                if si < bi:
                    sentiments[si] = sentiment * 0.5
                elif si > bi:
                    sentiments[si] = sentiment * 1.5
        return sentiments

    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = "{0} {1}".format(words_and_emoticons[i - 1], words_and_emoticons[i])

        twoonezero = "{0} {1} {2}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = "{0} {1}".format(
            words_and_emoticons[i - 2], words_and_emoticons[i - 1]
        )

        threetwoone = "{0} {1} {2}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{0} {1}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = "{0} {1}".format(
                words_and_emoticons[i], words_and_emoticons[i + 1]
            )
            if zeroone in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{0} {1} {2}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
            valence = valence + B_DECR
        return valence

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if negated([words_and_emoticons[i - 1]]):
                valence = valence * N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier
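
    # Worked arithmetic from the constants above: "Great!!!" contributes
    # 3 * 0.292 = 0.876 via _amplify_ep, "Really??" contributes
    # 2 * 0.18 = 0.36 via _amplify_qm, and four or more question marks are
    # capped at 0.96.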

    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (
                    float(sentiment_score) + 1
                )  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (
                    float(sentiment_score) - 1
                )  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count
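
    # Worked example (hypothetical scores): for sentiments [1.5, -2.0, 0],
    # this returns pos_sum = 2.5 (1.5 + 1), neg_sum = -3.0 (-2.0 - 1),
    # and neu_count = 1.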

    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)
        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict
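
# Example usage from client code (a sketch; assumes the vader_lexicon data
# package is installed, e.g. via nltk.download('vader_lexicon')):
#
#     from nltk.sentiment.vader import SentimentIntensityAnalyzer
#
#     sia = SentimentIntensityAnalyzer()
#     for sentence in ["The movie was GREAT!!", "The plot was dull."]:
#         print(sentence, sia.polarity_scores(sentence))
#
# Each call returns a dict with 'neg', 'neu' and 'pos' (each in [0, 1],
# summing to roughly 1) plus 'compound' (normalized to [-1, 1]).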