# N-Gram model for ngram lexicon features

In [2]:
from ekorpkit import eKonf

# Request WARNING-level logging from ekorpkit, then report the runtime context
# (package version, notebook/Colab detection, ekorpkit environment settings).
eKonf.setLogger("WARNING")
print("version:", eKonf.__version__)
print("is notebook?", eKonf.is_notebook())
print("is colab?", eKonf.is_colab())
print("environment variables:")
# eKonf.print renders the environment dict; the label above is printed separately.
eKonf.print(eKonf.env().dict())

INFO:ekorpkit.base:IPython version: (6, 9, 0), client: jupyter_client
INFO:ekorpkit.base:Google Colab not detected.


version: 0.1.35+1.gbeed9e1
is notebook? True
is colab? False
evironment varialbles:
{'CUDA_DEVICE_ORDER': None,
 'CUDA_VISIBLE_DEVICES': None,
 'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_PROJECT': 'ekorpkit-book',
 'EKORPKIT_WORKSPACE_ROOT': '/workspace',
 'KMP_DUPLICATE_LIB_OK': 'TRUE',
 'NUM_WORKERS': 230}


## Load an n-gram model with MPKO lexicon scores

In [2]:
# Compose the "mpko_lex" n-gram model configuration, switch on its `verbose` and
# `auto.load` options, and build the model object from that configuration.
ngram_cfg = eKonf.compose("model/ngram=mpko_lex")
ngram_cfg.verbose = True
# NOTE(review): presumably this makes instantiation load the lexicon candidates
# immediately (the recorded log shows a `load_candidates` call) — confirm.
ngram_cfg.auto.load = True
ngram = eKonf.instantiate(ngram_cfg)

INFO:ekorpkit.base:instantiating ekorpkit.models.ngram.ngram.Ngrams...
INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': None, 'backend': 'mecab-python3', 'verbose': False}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.base:Calling load_candidates
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/mpko/mp_polarity_lexicon_lex.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/mpko/mp_polarity_lexicon_lex.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/mpko/mp_polarity_lexicon_lex.parquet
INFO:ekorpkit.io.file: >> elapsed time to load data: 0:00:00.035776
INFO:ekorpkit.models.ngram.ngram:loaded 23956 candidates


In [3]:
# Report how many entries `ngram.candidates` holds after loading.
print(f"Number of candidates: {len(ngram.candidates)}")

Number of candidates: 23956


In [4]:
# Export the model's n-grams with a threshold of 0.9; as the cell's last
# expression, the returned value is rendered as the cell output.
# NOTE(review): presumably the threshold is applied to the polarity score — confirm.
ngram.export_ngrams(threshold=0.9)

[(('투기/NNG', '억제/NNG'),
  score(word='투기/NNG;억제/NNG', label=1, polarity=0.9984224740344032, intensity=76.89998054169148, pos_score=842.336810100104, neg_score=10.95366740233997)),
 (('금리/NNG', '상승/NNG'),
  score(word='금리/NNG;상승/NNG', label=1, polarity=0.9976489029338528, intensity=74.78067419154141, pos_score=852.2582592674567, neg_score=11.396771538653194)),
 (('채권/NNG', '가격/NNG', '하락/NNG'),
  score(word='채권/NNG;가격/NNG;하락/NNG', label=1, polarity=0.9965068843156996, intensity=71.85459337496019, pos_score=830.1285320910899, neg_score=11.552894437231236)),
 (('인플레이션/NNG', '압력/NNG'),
  score(word='인플레이션/NNG;압력/NNG', label=1, polarity=0.9962771197986816, intensity=71.29298491650549, pos_score=851.9671447928356, neg_score=11.950224075911729)),
 (('물가/NNG', '상승/NNG'),
  score(word='물가/NNG;상승/NNG', label=1, polarity=0.9892951767944276, intensity=57.572641724969934, pos_score=840.8393220118454, neg_score=14.604841758497308)),
 (('부동산/NNG', '가격/NNG', '상승/NNG'),
  score(word='부동산/NNG;가격/NNG;상승/N

In [5]:
# Tokenize a sample Korean sentence via `ngram.tokenize` and print the tokens.
# English gloss of the sentence: "Interest rates must be raised to curb speculation."
sentence = "투기를 억제하기 위해 금리를 인상해야 한다."
tokens = ngram.tokenize(sentence)
print(tokens)

['투기/NNG', '억제/NNG', '하/XSV', '금리/NNG', '인상/NNG', '해야/XSV']


In [6]:
# Pass the same sample sentence through `ngram.ngramize_sentence` and print the result.
# NOTE(review): this rebinds `tokens`, discarding the earlier `ngram.tokenize`
# result; a distinct name would keep both available for comparison.
tokens = ngram.ngramize_sentence(sentence)
print(tokens)

['투기/NNG', '억제/NNG;금리/NNG;인상/NNG', '하/XSV', '해야/XSV']


In [7]:
# Compose the base corpus configuration, point it at the "bok_minutes" corpus,
# instantiate it, and preview the last rows of the loaded data.
corpus_cfg = eKonf.compose("corpus")
corpus_cfg.name = "bok_minutes"
# NOTE(review): absolute, machine-specific path — the cell fails wherever this
# directory is missing; consider deriving it from a configurable data directory.
corpus_cfg.data_dir = "/workspace/data/datasets/corpus/ekorpkit"
corpus = eKonf.instantiate(corpus_cfg)
corpus.data.tail()

INFO:ekorpkit.datasets.base:Loaded info file: /workspace/data/datasets/corpus/ekorpkit/bok_minutes/info-bok_minutes.yaml
INFO:ekorpkit.io.file:Processing [1] files from ['bok_minutes-train.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/data/datasets/corpus/ekorpkit/bok_minutes/bok_minutes-train.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/data/datasets/corpus/ekorpkit/bok_minutes/bok_minutes-train.parquet
INFO:ekorpkit.info.column:index: index, index of data: None, columns: ['id', 'text'], id: ['id']
INFO:ekorpkit.info.column:Adding id [split] to ['id']
INFO:ekorpkit.info.column:Added id [split], now ['id', 'split']
INFO:ekorpkit.info.column:Added a column [split] with value [train]
INFO:ekorpkit.io.file:Processing [1] files from ['meta-bok_minutes-train.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/data/datasets/corpus/ekorpkit/bok_minutes/meta-bok_minutes-train.parquet']
INFO:ekorpkit.io.file:Loading data from /workspac

Unnamed: 0,id,text,split
181,181,Economic Situation\n일부 위원은 이번 전망에서 올해 소비자물가 상승...,train
182,182,Economic Situation\n일부 위원은 감염병 재확산에도 불구하고 성장경로...,train
183,183,Economic Situation\n일부 위원은 이번 전망내용을 보면 성장은 하방리...,train
184,184,"Economic Situation\n일부 위원은 우크라이나 사태, 공급망 차질, 중...",train
185,185,Economic Situation\n일부 위원은 향후 국내외 경제흐름이 인플레이션에...,train


In [8]:
# Set `max_window` and `max_skip` on the model's internal `_ngram` object, then
# extract n-grams from one corpus document and keep the result in `_ngrams`.
# NOTE(review): `_ngram` is a private attribute and 7 / 3 are unexplained
# constants; presumably they bound the n-gram span and the number of tokens that
# may be skipped — confirm, and prefer the model config if it exposes them.
ngram._ngram.max_window = 7
ngram._ngram.max_skip = 3
# NOTE(review): assuming `corpus.data` is a pandas DataFrame with an integer
# index, `text[1]` is a label lookup (the row labelled 1) — confirm.
_ngrams = ngram.find_ngrams(
    corpus.data.text[1], ignore_scores=False, strip_pos=False, use_surfaces_to_score=True
)

INFO:ekorpkit.io.file:Concatenating 1 dataframes
INFO:ekorpkit.models.ngram.ngram:found 99 ngrams


In [9]:
# Display the value returned by `find_ngrams`; the recorded output maps each
# n-gram string to its polarity and count.
_ngrams

{'경제/NNG;성장/NNG': {'polarity': 0.5204387993211343, 'count': 1},
 '근원/NNG;인플레이션/NNG;상승/NNG': {'polarity': 0.2543270656612199, 'count': 1},
 '소매/NNG;판매/NNG;부진/NNG': {'polarity': -0.0916732941026204, 'count': 1},
 '하락/NNG': {'polarity': -0.9974931628848912, 'count': 5},
 '가격/NNG;상승/NNG': {'polarity': 0.0582057939293445, 'count': 1},
 '높/VA;경제/NNG;성장/NNG': {'polarity': 0.6785215318580916, 'count': 2},
 '경제/NNG;낮/VA': {'polarity': 0.1811089746302118, 'count': 1},
 '투자/NNG;축소/NNG': {'polarity': -0.0091250155342883, 'count': 1},
 '실적/NNG;예상/NNG;하회/NNG': {'polarity': -0.0633039043188929, 'count': 1},
 '투자/NNG;수요/NNG;부진/NNG': {'polarity': -0.0567764807675792, 'count': 1},
 '실망/NNG': {'polarity': -0.0537436125235228, 'count': 1},
 '고용/NNG;악화/NNG': {'polarity': -0.624421305209124, 'count': 1},
 '실업/NNG;증가/NNG': {'polarity': -0.0971255742845687, 'count': 2},
 '악화/NNG;경제/NNG;성장/NNG': {'polarity': -0.4967007039179579, 'count': 1},
 '예상/NNG;낮/VA': {'polarity': -0.5326471339885891, 'count': 1},
 '성장/N

In [9]:
# Extract features for the same document (`corpus.data.text[1]`) using the
# default options of `find_features`.
_features = ngram.find_features(corpus.data.text[1])

INFO:ekorpkit.io.file:Concatenating 1 dataframes
INFO:ekorpkit.models.ngram.ngram:found 99 ngrams


In [10]:
# Display the value returned by `find_features`; the recorded output maps each
# n-gram string to label, polarity, intensity, pos/neg scores and count.
_features

{'경제/NNG;성장/NNG': {'label': 1,
  'polarity': 0.5204387993211343,
  'intensity': 3.261446146626619,
  'pos_score': 53.81467139538686,
  'neg_score': 16.50024834874201,
  'count': 1},
 '근원/NNG;인플레이션/NNG;상승/NNG': {'label': 1,
  'polarity': 0.2543270656612199,
  'intensity': 1.7920432753856634,
  'pos_score': 21.20702197813539,
  'neg_score': 11.8339898759261,
  'count': 1},
 '소매/NNG;판매/NNG;부진/NNG': {'label': -1,
  'polarity': -0.0916732941026204,
  'intensity': 1.3059461234579393,
  'pos_score': 9.189591593573098,
  'neg_score': 12.001111517788454,
  'count': 1},
 '하락/NNG': {'label': -1,
  'polarity': -0.9974931628848912,
  'intensity': 72.9138425756711,
  'pos_score': 11.736812558265145,
  'neg_score': 855.7761032135043,
  'count': 5},
 '가격/NNG;상승/NNG': {'label': 1,
  'polarity': 0.0582057939293445,
  'intensity': 1.2262908563119794,
  'pos_score': 48.02782791432222,
  'neg_score': 39.16511948785461,
  'count': 1},
 '높/VA;경제/NNG;성장/NNG': {'label': 1,
  'polarity': 0.6785215318580916,
  '