# N-Grams

import logging
from ekorpkit import eKonf

# Show INFO-level log records so the library's progress messages are visible.
logging.basicConfig(level=logging.INFO)
# Record which ekorpkit build produced the outputs in this notebook.
print(eKonf.__version__)
0.1.33+7.g877961c.dirty

## Tokenize and extract tokens

# Step 1: tokenize the "bok_minutes" corpus and save the result as parquet.
# The corpus and tokenizer configs are composed separately and then attached
# to the pipeline config; keep the assignments in this order, since the
# sub-configs are filled in before being attached.
corpus_cfg = eKonf.compose("corpus")
corpus_cfg.name = "bok_minutes"
# NOTE(review): absolute, machine-specific path — adjust for your environment.
corpus_cfg.data_dir = "/workspace/data/datasets/corpus/ekorpkit"
tkn_cfg = eKonf.compose("preprocessor/tokenizer=mecab")
cfg = eKonf.compose("pipeline")
cfg.data.corpus = corpus_cfg
# Pipeline steps, listed in the order they are configured below.
cfg._pipeline_ = ["tokenize", "explode_splits", "save_dataframe"]
# NOTE(review): 100 workers is hard-coded — confirm it suits the host machine.
cfg.num_workers = 100
cfg.tokenize.preprocessor.tokenizer = tkn_cfg
# presumably splits each document's text on the separator into one row per
# piece, keyed by `id` and numbered in `sent_id` — TODO confirm against ekorpkit.
cfg.explode_splits.id_key = "id"
cfg.explode_splits.split_key = "sent_id"
cfg.explode_splits.separator = "\n"
cfg.save_dataframe.output_dir = "../data/bok"
cfg.save_dataframe.output_file = "bok_minutes_tokenized.parquet"

# Instantiate the composed pipeline config and preview the last rows.
df = eKonf.instantiate(cfg)
df.tail()
id text split sent_id
181 181 또한/MAJ /SP 글로벌/NNG /SP 공급/NNG 망/NNG /SP 재편/NNG... train 321
181 181 향후/NNG /SP 경제/NNG 회복세/NNG 와/JC /SP 물가/NNG 의/JK... train 322
181 181 train 323
181 181 Government/SL ’/SY s/SL /SP View/SL train 324
181 181 train 325
# Step 2: run token extraction over the tokenized parquet written in step 1,
# using the "mecab_econ" tokenizer config.
tkn_cfg = eKonf.compose("preprocessor/tokenizer=mecab_econ")
# Keep the "/POS" suffix on each extracted token.
tkn_cfg.extract.strip_pos = False

cfg = eKonf.compose("pipeline")
# Input: the file saved by the tokenize pipeline.
cfg.data.data_dir = "../data/bok"
cfg.data.data_file = "bok_minutes_tokenized.parquet"
cfg._pipeline_ = ["extract_tokens", "save_dataframe"]
# NOTE(review): 100 workers is hard-coded — confirm it suits the host machine.
cfg.num_workers = 100
cfg.verbose = True
cfg.extract_tokens.preprocessor.tokenizer = tkn_cfg
# Do not restrict extraction to nouns.
cfg.extract_tokens.nouns_only = False
cfg.save_dataframe.output_dir = "../data/bok"
cfg.save_dataframe.output_file = "bok_minutes_tokens.parquet"
# Instantiate the composed pipeline config and preview the last rows.
df = eKonf.instantiate(cfg)
df.tail()
id text split sent_id
181 181 또한/MAJ 글로벌/NNG 공급/NNG 망/NNG 재편/NNG 기후/NNG 변화/N... train 321
181 181 향후/NNG 경제/NNG 회복세/NNG 와/JC 물가/NNG 의/JKG 흐름/NNG... train 322
181 181 train 323
181 181 Government/SL s/SL View/SL train 324
181 181 train 325

## Train ngrams

# Compose the "npmi" n-gram model config and point it at the extracted tokens
# saved by the previous pipeline.
ngram_cfg = eKonf.compose("model/ngram=npmi")
ngram_cfg.data.data_dir = "../data/bok"
ngram_cfg.data.data_file = "bok_minutes_tokens.parquet"
ngram_cfg.verbose = True
# presumably: load previously saved scores if present, but retrain anyway —
# TODO confirm the exact semantics of these two flags in ekorpkit.
ngram_cfg.auto.load = True
ngram_cfg.force.train = True
# Score threshold applied to candidates (semantics defined by ekorpkit).
ngram_cfg.candidates.threshold = 0.4
# Instantiate the trainer object from the composed config.
ngram = eKonf.instantiate(ngram_cfg)
INFO:ekorpkit.base:instantiating ekorpkit.models.ngram.train.NgramTrainer...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.base:Calling initialize
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/esgml/outputs/ngrams/ngram_npmi_scores.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/esgml/outputs/ngrams/ngram_npmi_scores.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/esgml/outputs/ngrams/ngram_npmi_scores.parquet
INFO:ekorpkit.io.file: >> elapsed time to load data: 0:00:00.038492
INFO:ekorpkit.models.ngram.ngram:loaded 42627 candidates
INFO:ekorpkit.base:instantiating ekorpkit.pipelines.data.Data...
INFO:ekorpkit.io.file:Processing [1] files from ['bok_minutes_tokens.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/bok/bok_minutes_tokens.parquet']
INFO:ekorpkit.io.file:Loading data from ../data/bok/bok_minutes_tokens.parquet
9930it [00:09, 1036.58it/s]INFO:ekorpkit.models.ngram.score:pruned out 210818 tokens with count <=2 (before 263398, after 52580)
19929it [00:21, 864.90it/s]INFO:ekorpkit.models.ngram.score:pruned out 257768 tokens with count <=2 (before 343124, after 85356)
29911it [00:32, 1097.02it/s]INFO:ekorpkit.models.ngram.score:pruned out 191234 tokens with count <=2 (before 293933, after 102699)
36191it [00:38, 945.55it/s] 
learning ngrams was done. memory= 0.564 Gb
100%|██████████| 46494/46494 [00:00<00:00, 118903.01it/s]
INFO:ekorpkit.io.file:Saving dataframe to /workspace/projects/esgml/outputs/ngrams/ngram_npmi_scores.parquet
INFO:ekorpkit.io.file: >> elapsed time to save data: 0:00:00.107443
# Print the number of entries in `ngram._ngrams` and in `ngram.candidates`.
ngram_total = len(ngram._ngrams)
candidate_total = len(ngram.candidates)
print(ngram_total, candidate_total)
46494 42627
# POS-tag patterns handed to `export_ngrams`; each inner list describes the
# tag sequence of one n-gram shape (e.g. noun + particle + noun).
postag_rules = [
    ["/NN", "/J", "/NN"],
    ["/XP", "/NN"],
    ["/XP", "/NN", "/NN"],
]

# Bare expression so the notebook displays the exported n-grams.
ngram.export_ngrams(postag_rules=postag_rules, threshold=0.8)
[(('민/XPN', '감도/NNG'),
  score(words='민/XPN;감도/NNG', length=46494, frequency=27, score=0.990604973436752)),
 (('신/XPN', '흥국/NNG'),
  score(words='신/XPN;흥국/NNG', length=46494, frequency=1022, score=0.9603989288750724)),
 (('문안/NNG', '을/JKO', '작성/NNG'),
  score(words='문안/NNG;을/JKO;작성/NNG', length=46494, frequency=49, score=0.9549515116468318)),
 (('예/NNG', '의/JKG', '주시/NNG'),
  score(words='예/NNG;의/JKG;주시/NNG', length=46494, frequency=212, score=0.952727184984961)),
 (('한국/NNP', '은/JX', '행법/NNG'),
  score(words='한국/NNP;은/JX;행법/NNG', length=46494, frequency=13, score=0.9163779678011934)),
 (('한/NNP', '은/JX', '법/NNG'),
  score(words='한/NNP;은/JX;법/NNG', length=46494, frequency=15, score=0.9038684845812752)),
 (('취지/NNG', '로/JKB', '발언/NNG'),
  score(words='취지/NNG;로/JKB;발언/NNG', length=46494, frequency=38, score=0.8748905201033002)),
 (('신/XPN', '용스/NNP', '프레드/NNP'),
  score(words='신/XPN;용스/NNP;프레드/NNP', length=46494, frequency=112, score=0.8671915785294462)),
 (('차관/NNG', '에게/JKB', '발언/NNG'),
  score(words='차관/NNG;에게/JKB;발언/NNG', length=46494, frequency=31, score=0.8615166170202944)),
 (('양질/NNG', '의/JKG', '일자리/NNG'),
  score(words='양질/NNG;의/JKG;일자리/NNG', length=46494, frequency=15, score=0.8568614320936532)),
 (('시일/NNG', '이/JKS', '소요/NNG'),
  score(words='시일/NNG;이/JKS;소요/NNG', length=46494, frequency=28, score=0.8522311395365183)),
 (('고부/NNG', '가/JKS', '가치/NNG'),
  score(words='고부/NNG;가/JKS;가치/NNG', length=46494, frequency=10, score=0.8419083361774105)),
 (('저점/NNG', '을/JKO', '통과/NNG'),
  score(words='저점/NNG;을/JKO;통과/NNG', length=46494, frequency=13, score=0.8208059041912359)),
 (('북한/NNP', '의/JKG', '미사일/NNG'),
  score(words='북한/NNP;의/JKG;미사일/NNG', length=46494, frequency=11, score=0.813089753035318)),
 (('재/XPN', '정취/NNG', '약국/NNG'),
  score(words='재/XPN;정취/NNG;약국/NNG', length=46494, frequency=14, score=0.8102225991026599)),
 (('결시/NNG', '까지/JX', '콜/NNG'),
  score(words='결시/NNG;까지/JX;콜/NNG', length=46494, frequency=25, score=0.8063747933078849)),
 (('끝/NNG', '으로/JKB', '동/NNG'),
  score(words='끝/NNG;으로/JKB;동/NNG', length=46494, frequency=61, score=0.8030470926229916))]
# Pick one training sentence and show it before and after n-gram merging.
# POS tags are kept and merged parts are joined with ";".
sentence_index = 18
sentence = ngram.sentences[sentence_index]
print(sentence)

tokens = ngram.ngramize_sentence(
    sentence,
    strip_pos=False,
    surface_delim=";",
)
print(tokens)
이/NP 에/JKB 대해/VV+EC 관련/NNG 부서/NNG 에서/JKB 는/JX 실업/NNG 률/XSN 과/JC 경기/NNG 지수/NNG 와/JKB 의/JKG 상관/NNG 관계/NNG 는/JX 그렇게/MAG 높/VA 지/EC 않/VX 은/ETM 것/NNB 으로/JKB 나타나/VV 며/EC 통계청/NNG 에서/JKB 발표/NNG 하/XSV 는/ETM 공식/NNG 적/XSN 인/VCP+ETM 실업/NNG 률/XSN 을/JKO 구직/NNG 단념/NNG 자/XSN 를/JKO 포함/NNG 한/XSA+ETM 광의/NNG 의/JKG 실업/NNG 률/XSN 로/JKB 전환/NNG 할/XSV+ETM 경우/NNG 오히려/MAJ 경기/NNG 와/JKB 의/JKG 연관/NNG 성/XSN 이/JKS 좀/MAG 더/MAG 높/VA 은/ETM 것/NNB 으로/JKB 분석/NNG 되/XSV 므로/EC 경기/NNG 상환/NNG 판단/NNG 을/JKO 위해서/VV+EC 는/JX 공식/NNG 적/XSN 인/VCP+ETM 실업/NNG 률/XSN 보다/JKB 는/JX 광의/NNG 의/JKG 실업/NNG 률/XSN 을/JKO 이용/NNG 하/XSV 는/ETM 것/NNB 이/JKS 더/MAG 좋/VA 을/ETM 것/NNB 같/VA 다고/EC 설명/NNG 하/XSV 였/EP 음/ETN
['이/NP', '에/JKB', '대해/VV+', '관련/NNG;부서/NNG', '에서/JKB', '는/JX', '실업/NNG;률/XSN', '과/JC', '경기/NNG', '지수/NNG', '와/JKB', '의/JKG', '상관/NNG;관계/NNG', '는/JX', '그렇게/MAG', '높/VA', '지/EC;않/VX', '은/ETM', '것/NNB;으로/JKB', '나타나/VV', '며/EC', '통계청/NNG;에서/JKB', '발표/NNG', '하/XSV;는/ETM', '공식/NNG', '적/XSN;인/VCP', '실업/NNG;률/XSN', '을/JKO', '구직/NNG', '단념/NNG', '자/XSN', '를/JKO', '포함/NNG;한/XSA', '광의/NNG;의/JKG;실업/NNG', '률/XSN', '로/JKB;전환/NNG', '할/XSV;경우/NNG', '오히려/MAJ', '경기/NNG;와/JKB;의/JKG', '연관/NNG', '성/XSN;이/JKS', '좀/MAG;더/MAG', '높/VA;은/ETM', '것/NNB;으로/JKB', '분석/NNG', '되/XSV', '므로/EC', '경기/NNG', '상환/NNG', '판단/NNG', '을/JKO', '위해서/VV+;는/JX', '공식/NNG', '적/XSN;인/VCP', '실업/NNG;률/XSN', '보다/JKB;는/JX', '광의/NNG;의/JKG;실업/NNG', '률/XSN', '을/JKO', '이용/NNG', '하/XSV', '는/ETM;것/NNB', '이/JKS', '더/MAG', '좋/VA', '을/ETM;것/NNB', '같/VA', '다고/EC', '설명/NNG', '하/XSV', '였/EP;음/ETN']
# Indexing the trainer with a sentence also returns n-gram-merged tokens; the
# printed result has no POS tags and no delimiter between merged parts.
tokens = ngram[sentence]
print(tokens)
['이', '에', '대해', '관련부서', '에서', '는', '실업률', '과', '경기', '지수', '와', '의', '상관관계', '는', '그렇게', '높', '지않', '은', '것으로', '나타나', '며', '통계청에서', '발표', '하는', '공식', '적인', '실업률', '을', '구직', '단념', '자', '를', '포함한', '광의의실업', '률', '로전환', '할경우', '오히려', '경기와의', '연관', '성이', '좀더', '높은', '것으로', '분석', '되', '므로', '경기', '상환', '판단', '을', '위해서는', '공식', '적인', '실업률', '보다는', '광의의실업', '률', '을', '이용', '하', '는것', '이', '더', '좋', '을것', '같', '다고', '설명', '하', '였음']
# List the n-grams found in a single sentence, with a 0.5 score threshold and
# POS-tag rules switched off; POS tags are kept and parts are joined with ";".
find_kwargs = {
    "strip_pos": False,
    "surface_delim": ";",
    "threshold": 0.5,
    "apply_postag_rules": False,
}
_ngrams = ngram.find_ngrams(ngram.sentences[10], **find_kwargs)
# Bare expression so the notebook displays the result.
_ngrams
INFO:ekorpkit.models.ngram.ngram:found 25 ngrams
{'동/MM;위원/NNG': {'score': 0.8568080788235676, 'count': 1},
 '관련/NNG;하/XSV;여/EC': {'score': 0.6744817132834611, 'count': 1},
 '고용/NNG;사정/NNG': {'score': 0.6440199999085726, 'count': 1},
 '지/EC;않/VX': {'score': 0.9294521637628269, 'count': 2},
 '우리/NP;나라/NNG': {'score': 0.8562398240331328, 'count': 2},
 '실업/NNG;률/XSN': {'score': 0.6494435441593894, 'count': 3},
 '에/JKB;비해/VV+': {'score': 0.5012251931663576, 'count': 2},
 '지/EC;못하/VX': {'score': 0.7024120167866501, 'count': 1},
 '구조/NNG;적/XSN;요인/NNG': {'score': 0.546126756666045, 'count': 3},
 '적/XSN;인/VCP': {'score': 0.7342274264210907, 'count': 1},
 '측면/NNG;에서/JKB': {'score': 0.5471802970887485, 'count': 2},
 '해/XSV;보/VX;면/EC': {'score': 0.7278386924178137, 'count': 1},
 '방향/NNG;으로/JKB;가/VV': {'score': 0.5214619840461885, 'count': 2},
 '는/ETM;것/NNB': {'score': 0.5236409165544453, 'count': 1},
 '다는/ETM;견해/NNG': {'score': 0.7309617813933906, 'count': 1},
 '활동/NNG;참가/NNG': {'score': 0.7741200525059377, 'count': 1},
 '기/ETN;때문/NNB': {'score': 0.8196906558080235, 'count': 1},
 '같/VA;은/ETM': {'score': 0.7999751061740198, 'count': 1},
 '과/JKB;함께/MAG': {'score': 0.6284362488292308, 'count': 1},
 '좀/MAG;더/MAG': {'score': 0.8453728443412079, 'count': 1},
 '심/VV;도/EC': {'score': 0.9942257464499541, 'count': 1},
 '해/XSV;보/VX': {'score': 0.7018456967682065, 'count': 1},
 '필요/NNG;가/JKS;있/VA': {'score': 0.8839755298206097, 'count': 1},
 '의견/NNG;을/JKO;제시/NNG': {'score': 0.7695350089945394, 'count': 1},
 '였/EP;음/ETN': {'score': 0.8196123192196287, 'count': 1}}