Lexicon-based Sentiment Analysis

import logging
from ekorpkit import eKonf

# Configure root logging at INFO level so the library's INFO messages
# (lexicon loading, tokenizer instantiation) appear in the cell outputs.
logging.basicConfig(level=logging.INFO)
# Report the ekorpkit version this notebook was run with.
print(eKonf.__version__)
0.1.33+11.g6ef57fc.dirty

Instantiating a sentiment analyser class

# Compose the configuration for the "lm" sentiment model; the analyser is
# built from it below. The recorded log shows it loading LM.parquet.
cfg = eKonf.compose("model/sentiment=lm")
# Uncomment the next two lines to inspect the composed configuration.
# cfg.verbose = True
# eKonf.print(cfg)
lmsa = eKonf.instantiate(cfg)

# Pre-tokenised input: predict() is given a list of tokens here, not raw text.
tokens = ["Fraud", "Good", "Good", "Good", "Sound", "uncertain", "beat", "wrong"]

# Bare last expression, so the notebook displays the returned dict
# (num_tokens, polarity, polarity_label, uncertainty in the recorded output).
lmsa.predict(tokens)
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.base:Calling load_candidates
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet
INFO:ekorpkit.models.ngram.ngram:loaded 58142 candidates
{'num_tokens': 8,
 'polarity': -0.9999990000010001,
 'polarity_label': 'negative',
 'uncertainty': 0.125001}
# analyze() is given a raw sentence here. In the recorded output the result is
# keyed by word form (e.g. 'capability', 'provide'), each entry holding the
# requested feature columns plus an occurrence 'count'.
# NOTE(review): non-zero feature values show up as 2009 in the output —
# presumably a lexicon marker (e.g. year added) rather than a score; confirm.
text = "Beyond the improved voice capabilities, customers now have a streamlined way to comply with recalls and other traceability requirements, providing them with a competitive advantage."
features = lmsa.analyze(text, features=['Negative', 'Positive'])

eKonf.print(features)
{'advantage': {'Negative': 0, 'Positive': 2009, 'count': 1},
 'and': {'Negative': 0, 'Positive': 0, 'count': 1},
 'beyond': {'Negative': 0, 'Positive': 0, 'count': 1},
 'capability': {'Negative': 0, 'Positive': 0, 'count': 1},
 'competitive': {'Negative': 0, 'Positive': 0, 'count': 1},
 'comply': {'Negative': 0, 'Positive': 0, 'count': 1},
 'customer': {'Negative': 0, 'Positive': 0, 'count': 1},
 'have': {'Negative': 0, 'Positive': 0, 'count': 1},
 'improved': {'Negative': 0, 'Positive': 2009, 'count': 1},
 'now': {'Negative': 0, 'Positive': 0, 'count': 1},
 'other': {'Negative': 0, 'Positive': 0, 'count': 1},
 'provide': {'Negative': 0, 'Positive': 0, 'count': 1},
 'recall': {'Negative': 2009, 'Positive': 0, 'count': 1},
 'requirement': {'Negative': 0, 'Positive': 0, 'count': 1},
 'streamlined': {'Negative': 0, 'Positive': 0, 'count': 1},
 'the': {'Negative': 0, 'Positive': 0, 'count': 1},
 'them': {'Negative': 0, 'Positive': 0, 'count': 1},
 'to': {'Negative': 0, 'Positive': 0, 'count': 1},
 'traceability': {'Negative': 0, 'Positive': 0, 'count': 1},
 'voice': {'Negative': 0, 'Positive': 0, 'count': 1},
 'way': {'Negative': 0, 'Positive': 0, 'count': 1},
 'with': {'Negative': 0, 'Positive': 0, 'count': 2}}
# predict() is called with a raw sentence this time (earlier it was given a
# token list); the recorded output reports 22 tokens for this sentence.
text = "Operating loss amounted to EUR 0.7 mn compared to a profit of EUR 0.8 mn in the second quarter of 2005."
print(lmsa.predict(text))
{'num_tokens': 22, 'polarity': -0.9999990000010001, 'polarity_label': 'negative', 'uncertainty': 1e-06}
# Same workflow with the "hiv4" sentiment model configuration; `cfg` is
# rebound, so the LM configuration object is no longer referenced by name.
# The recorded log shows this analyser loading HIV-4.parquet.
cfg = eKonf.compose('model/sentiment=hiv4')
hivsa = eKonf.instantiate(cfg)

# Same token list as the LM example, with "legal" appended.
tokens = ["Fraud", "Good","Good","Good", "Sound", "uncertain", "beat", "wrong", "legal"]

# Bare last expression, so the notebook displays the returned dict; in the
# recorded output it carries a 'legal' entry where the LM result had 'uncertainty'.
hivsa.predict(tokens)
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.base:Calling load_candidates
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet
INFO:ekorpkit.models.ngram.ngram:loaded 11787 candidates
{'num_tokens': 9,
 'polarity': 0.9999990000010001,
 'polarity_label': 'positive',
 'legal': 0.1111121111111111}
# Analyse and score the two example sentences with the HIV-4 analyser.
# The feature names for this lexicon are spelled 'Negativ' / 'Positiv'.
text = "Beyond the improved voice capabilities, customers now have a streamlined way to comply with recalls and other traceability requirements, providing them with a competitive advantage."
# Store the analysis result itself in `features` and print it in a separate
# step (the same pattern the LM example uses). Assigning the return value of
# eKonf.print() would leave `features` holding the printer's return value
# instead of the per-word feature mapping.
features = hivsa.analyze(text, features=['Negativ', 'Positiv'])
eKonf.print(features)
print(hivsa.predict(text))

text = "Operating loss amounted to EUR 0.7 mn compared to a profit of EUR 0.8 mn in the second quarter of 2005."
print(hivsa.predict(text))
{'a': {'Negativ': None, 'Positiv': None, 'count': 2},
 'advantage': {'Negativ': None, 'Positiv': 'Positiv', 'count': 1},
 'and': {'Negativ': None, 'Positiv': None, 'count': 1},
 'beyond': {'Negativ': None, 'Positiv': None, 'count': 1},
 'competitive': {'Negativ': 'Negativ', 'Positiv': None, 'count': 1},
 'comply': {'Negativ': None, 'Positiv': None, 'count': 1},
 'now': {'Negativ': None, 'Positiv': None, 'count': 1},
 'the': {'Negativ': None, 'Positiv': None, 'count': 1},
 'them': {'Negativ': None, 'Positiv': None, 'count': 1},
 'with': {'Negativ': None, 'Positiv': None, 'count': 2}}
{'num_tokens': 28, 'polarity': 0.0, 'polarity_label': 'neutral', 'legal': 1e-06}
{'num_tokens': 22, 'polarity': -0.9999990000010001, 'polarity_label': 'negative', 'legal': 1e-06}
# The two example sentences, treated together as one article.
doc = [
    "Beyond the improved voice capabilities, customers now have a streamlined way to comply with recalls and other traceability requirements, providing them with a competitive advantage.",
    "Operating loss amounted to EUR 0.7 mn compared to a profit of EUR 0.8 mn in the second quarter of 2005.",
]

# The sentences are joined with newlines into a single string before scoring.
# The recorded output reports num_examples = 2 and a polarity of about -0.5,
# which appears to be the mean of the two per-sentence polarities shown
# earlier (0.0 and about -1.0) — confirm against predict_article's implementation.
hivsa.predict_article('\n'.join(doc))
{'num_examples': 2,
 'polarity': -0.499999750000125,
 'polarity_label': 'negative'}