{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "AESO0odcxhzs"
},
"source": [
"# N-Grams"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.1.33+7.g877961c.dirty\n"
]
}
],
"source": [
"import logging\n",
"import os\n",
"\n",
"from ekorpkit import eKonf\n",
"\n",
"# Surface INFO-level records: ekorpkit reports its progress through logging.\n",
"logging.basicConfig(level=logging.INFO)\n",
"\n",
"# Record which ekorpkit build produced the outputs below.\n",
"ekorpkit_version = eKonf.__version__\n",
"print(ekorpkit_version)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "dOpN8AxSxhzv"
},
"source": [
"## Tokenize and extract tokens"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Location of the ekorpkit corpus collection. The default is the path this\n",
"# notebook was originally run with; set BOK_MINUTES_DATA_DIR to point at a\n",
"# different copy of the data instead of editing the notebook.\n",
"corpus_data_dir = os.environ.get(\n",
"    \"BOK_MINUTES_DATA_DIR\", \"/workspace/data/datasets/corpus/ekorpkit\"\n",
")\n",
"\n",
"# Compose the generic corpus config and point it at the `bok_minutes` corpus.\n",
"corpus_cfg = eKonf.compose(\"corpus\")\n",
"corpus_cfg.name = \"bok_minutes\"\n",
"corpus_cfg.data_dir = corpus_data_dir"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6a07b8108fb14113a6f1077b89e26459",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Tokenizing column: text: 0%| | 0/182 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
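"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>id</th>\n",
"      <th>text</th>\n",
"      <th>split</th>\n",
"      <th>sent_id</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td>또한/MAJ /SP 글로벌/NNG /SP 공급/NNG 망/NNG /SP 재편/NNG...</td>\n",
"      <td>train</td>\n",
"      <td>321</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td>향후/NNG /SP 경제/NNG 회복세/NNG 와/JC /SP 물가/NNG 의/JK...</td>\n",
"      <td>train</td>\n",
"      <td>322</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td></td>\n",
"      <td>train</td>\n",
"      <td>323</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td>Government/SL ’/SY s/SL /SP View/SL</td>\n",
"      <td>train</td>\n",
"      <td>324</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td></td>\n",
"      <td>train</td>\n",
"      <td>325</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"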
],
"text/plain": [
" id text split sent_id\n",
"181 181 또한/MAJ /SP 글로벌/NNG /SP 공급/NNG 망/NNG /SP 재편/NNG... train 321\n",
"181 181 향후/NNG /SP 경제/NNG 회복세/NNG 와/JC /SP 물가/NNG 의/JK... train 322\n",
"181 181 train 323\n",
"181 181 Government/SL ’/SY s/SL /SP View/SL train 324\n",
"181 181 train 325"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stage 1: tokenize the corpus with the stock mecab tokenizer config, split the\n",
"# documents on newlines, and save the result as parquet.\n",
"tkn_cfg = eKonf.compose(\"preprocessor/tokenizer=mecab\")\n",
"\n",
"cfg = eKonf.compose(\"pipeline\")\n",
"cfg._pipeline_ = [\"tokenize\", \"explode_splits\", \"save_dataframe\"]\n",
"# NOTE(review): the worker count is fixed at 100 -- confirm it suits the host.\n",
"cfg.num_workers = 100\n",
"cfg.data.corpus = corpus_cfg\n",
"\n",
"# tokenize step\n",
"cfg.tokenize.preprocessor.tokenizer = tkn_cfg\n",
"\n",
"# explode_splits step: in the recorded output each newline-separated segment\n",
"# becomes its own row, keeping the document `id` and numbered in `sent_id`.\n",
"cfg.explode_splits.separator = \"\\n\"\n",
"cfg.explode_splits.id_key = \"id\"\n",
"cfg.explode_splits.split_key = \"sent_id\"\n",
"\n",
"# save_dataframe step: this file is the input of the token-extraction cell.\n",
"cfg.save_dataframe.output_dir = \"../data/bok\"\n",
"cfg.save_dataframe.output_file = \"bok_minutes_tokenized.parquet\"\n",
"\n",
"# Instantiating the pipeline config runs it and returns the resulting frame.\n",
"df = eKonf.instantiate(cfg)\n",
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9baca07af8fb496f8354aef92b2c8630",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Extracting column: text: 0%| | 0/36191 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
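"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>id</th>\n",
"      <th>text</th>\n",
"      <th>split</th>\n",
"      <th>sent_id</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td>또한/MAJ 글로벌/NNG 공급/NNG 망/NNG 재편/NNG 기후/NNG 변화/N...</td>\n",
"      <td>train</td>\n",
"      <td>321</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td>향후/NNG 경제/NNG 회복세/NNG 와/JC 물가/NNG 의/JKG 흐름/NNG...</td>\n",
"      <td>train</td>\n",
"      <td>322</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td></td>\n",
"      <td>train</td>\n",
"      <td>323</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td>Government/SL s/SL View/SL</td>\n",
"      <td>train</td>\n",
"      <td>324</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>181</th>\n",
"      <td>181</td>\n",
"      <td></td>\n",
"      <td>train</td>\n",
"      <td>325</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"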
],
"text/plain": [
" id text split sent_id\n",
"181 181 또한/MAJ 글로벌/NNG 공급/NNG 망/NNG 재편/NNG 기후/NNG 변화/N... train 321\n",
"181 181 향후/NNG 경제/NNG 회복세/NNG 와/JC 물가/NNG 의/JKG 흐름/NNG... train 322\n",
"181 181 train 323\n",
"181 181 Government/SL s/SL View/SL train 324\n",
"181 181 train 325"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Stage 2: read the tokenized parquet file back, run `extract_tokens` with the\n",
"# mecab_econ tokenizer config, and save the extracted tokens next to it.\n",
"bok_dir = \"../data/bok\"\n",
"\n",
"tkn_cfg = eKonf.compose(\"preprocessor/tokenizer=mecab_econ\")\n",
"# In the recorded output the tokens keep their \"/TAG\" suffix with this setting.\n",
"tkn_cfg.extract.strip_pos = False\n",
"\n",
"cfg = eKonf.compose(\"pipeline\")\n",
"cfg._pipeline_ = [\"extract_tokens\", \"save_dataframe\"]\n",
"cfg.verbose = True\n",
"# NOTE(review): the worker count is fixed at 100 -- confirm it suits the host.\n",
"cfg.num_workers = 100\n",
"\n",
"# Input: the file written by the tokenize pipeline.\n",
"cfg.data.data_dir = bok_dir\n",
"cfg.data.data_file = \"bok_minutes_tokenized.parquet\"\n",
"\n",
"# extract_tokens step\n",
"cfg.extract_tokens.preprocessor.tokenizer = tkn_cfg\n",
"cfg.extract_tokens.nouns_only = False\n",
"\n",
"# save_dataframe step: this file is the training input of the n-gram model.\n",
"cfg.save_dataframe.output_dir = bok_dir\n",
"cfg.save_dataframe.output_file = \"bok_minutes_tokens.parquet\"\n",
"\n",
"df = eKonf.instantiate(cfg)\n",
"df.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train ngrams"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:ekorpkit.base:instantiating ekorpkit.models.ngram.train.NgramTrainer...\n",
"INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...\n",
"INFO:ekorpkit.base:Calling initialize\n",
"INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/esgml/outputs/ngrams/ngram_npmi_scores.parquet']\n",
"INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/esgml/outputs/ngrams/ngram_npmi_scores.parquet']\n",
"INFO:ekorpkit.io.file:Loading data from /workspace/projects/esgml/outputs/ngrams/ngram_npmi_scores.parquet\n",
"INFO:ekorpkit.io.file: >> elapsed time to load data: 0:00:00.038492\n",
"INFO:ekorpkit.models.ngram.ngram:loaded 42627 candidates\n",
"INFO:ekorpkit.base:instantiating ekorpkit.pipelines.data.Data...\n",
"INFO:ekorpkit.io.file:Processing [1] files from ['bok_minutes_tokens.parquet']\n",
"INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/bok/bok_minutes_tokens.parquet']\n",
"INFO:ekorpkit.io.file:Loading data from ../data/bok/bok_minutes_tokens.parquet\n",
"9930it [00:09, 1036.58it/s]INFO:ekorpkit.models.ngram.score:pruned out 210818 tokens with count <=2 (before 263398, after 52580)\n",
"19929it [00:21, 864.90it/s]INFO:ekorpkit.models.ngram.score:pruned out 257768 tokens with count <=2 (before 343124, after 85356)\n",
"29911it [00:32, 1097.02it/s]INFO:ekorpkit.models.ngram.score:pruned out 191234 tokens with count <=2 (before 293933, after 102699)\n",
"36191it [00:38, 945.55it/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"learning ngrams was done. memory= 0.564 Gb\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 46494/46494 [00:00<00:00, 118903.01it/s]\n",
"INFO:ekorpkit.io.file:Saving dataframe to /workspace/projects/esgml/outputs/ngrams/ngram_npmi_scores.parquet\n",
"INFO:ekorpkit.io.file: >> elapsed time to save data: 0:00:00.107443\n"
]
}
],
"source": [
"# Build the n-gram model from the `npmi` model config.\n",
"ngram_cfg = eKonf.compose(\"model/ngram=npmi\")\n",
"ngram_cfg.verbose = True\n",
"\n",
"# Training input: the token file saved by the extract-tokens pipeline.\n",
"ngram_cfg.data.data_file = \"bok_minutes_tokens.parquet\"\n",
"ngram_cfg.data.data_dir = \"../data/bok\"\n",
"\n",
"# Both auto.load and force.train are enabled: in the recorded run the log shows\n",
"# previously saved scores being loaded and the model then being trained again.\n",
"ngram_cfg.auto.load = True\n",
"ngram_cfg.force.train = True\n",
"ngram_cfg.candidates.threshold = 0.4\n",
"\n",
"ngram = eKonf.instantiate(ngram_cfg)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"46494 42627\n"
]
}
],
"source": [
"# Compare the size of the model's n-gram table with its candidate set.\n",
"ngram_count = len(ngram._ngrams)\n",
"candidate_count = len(ngram.candidates)\n",
"print(ngram_count, candidate_count)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(('민/XPN', '감도/NNG'),\n",
" score(words='민/XPN;감도/NNG', length=46494, frequency=27, score=0.990604973436752)),\n",
" (('신/XPN', '흥국/NNG'),\n",
" score(words='신/XPN;흥국/NNG', length=46494, frequency=1022, score=0.9603989288750724)),\n",
" (('문안/NNG', '을/JKO', '작성/NNG'),\n",
" score(words='문안/NNG;을/JKO;작성/NNG', length=46494, frequency=49, score=0.9549515116468318)),\n",
" (('예/NNG', '의/JKG', '주시/NNG'),\n",
" score(words='예/NNG;의/JKG;주시/NNG', length=46494, frequency=212, score=0.952727184984961)),\n",
" (('한국/NNP', '은/JX', '행법/NNG'),\n",
" score(words='한국/NNP;은/JX;행법/NNG', length=46494, frequency=13, score=0.9163779678011934)),\n",
" (('한/NNP', '은/JX', '법/NNG'),\n",
" score(words='한/NNP;은/JX;법/NNG', length=46494, frequency=15, score=0.9038684845812752)),\n",
" (('취지/NNG', '로/JKB', '발언/NNG'),\n",
" score(words='취지/NNG;로/JKB;발언/NNG', length=46494, frequency=38, score=0.8748905201033002)),\n",
" (('신/XPN', '용스/NNP', '프레드/NNP'),\n",
" score(words='신/XPN;용스/NNP;프레드/NNP', length=46494, frequency=112, score=0.8671915785294462)),\n",
" (('차관/NNG', '에게/JKB', '발언/NNG'),\n",
" score(words='차관/NNG;에게/JKB;발언/NNG', length=46494, frequency=31, score=0.8615166170202944)),\n",
" (('양질/NNG', '의/JKG', '일자리/NNG'),\n",
" score(words='양질/NNG;의/JKG;일자리/NNG', length=46494, frequency=15, score=0.8568614320936532)),\n",
" (('시일/NNG', '이/JKS', '소요/NNG'),\n",
" score(words='시일/NNG;이/JKS;소요/NNG', length=46494, frequency=28, score=0.8522311395365183)),\n",
" (('고부/NNG', '가/JKS', '가치/NNG'),\n",
" score(words='고부/NNG;가/JKS;가치/NNG', length=46494, frequency=10, score=0.8419083361774105)),\n",
" (('저점/NNG', '을/JKO', '통과/NNG'),\n",
" score(words='저점/NNG;을/JKO;통과/NNG', length=46494, frequency=13, score=0.8208059041912359)),\n",
" (('북한/NNP', '의/JKG', '미사일/NNG'),\n",
" score(words='북한/NNP;의/JKG;미사일/NNG', length=46494, frequency=11, score=0.813089753035318)),\n",
" (('재/XPN', '정취/NNG', '약국/NNG'),\n",
" score(words='재/XPN;정취/NNG;약국/NNG', length=46494, frequency=14, score=0.8102225991026599)),\n",
" (('결시/NNG', '까지/JX', '콜/NNG'),\n",
" score(words='결시/NNG;까지/JX;콜/NNG', length=46494, frequency=25, score=0.8063747933078849)),\n",
" (('끝/NNG', '으로/JKB', '동/NNG'),\n",
" score(words='끝/NNG;으로/JKB;동/NNG', length=46494, frequency=61, score=0.8030470926229916))]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each rule lists one POS-tag fragment per token of an n-gram (presumably\n",
"# matched against the \"/TAG\" part of each token -- confirm in ekorpkit).\n",
"postag_rules = [\n",
"    [\"/NN\", \"/J\", \"/NN\"],\n",
"    [\"/XP\", \"/NN\"],\n",
"    [\"/XP\", \"/NN\", \"/NN\"],\n",
"]\n",
"\n",
"# Export the n-grams that pass the score threshold and the POS-tag rules.\n",
"ngram.export_ngrams(postag_rules=postag_rules, threshold=0.8)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"이/NP 에/JKB 대해/VV+EC 관련/NNG 부서/NNG 에서/JKB 는/JX 실업/NNG 률/XSN 과/JC 경기/NNG 지수/NNG 와/JKB 의/JKG 상관/NNG 관계/NNG 는/JX 그렇게/MAG 높/VA 지/EC 않/VX 은/ETM 것/NNB 으로/JKB 나타나/VV 며/EC 통계청/NNG 에서/JKB 발표/NNG 하/XSV 는/ETM 공식/NNG 적/XSN 인/VCP+ETM 실업/NNG 률/XSN 을/JKO 구직/NNG 단념/NNG 자/XSN 를/JKO 포함/NNG 한/XSA+ETM 광의/NNG 의/JKG 실업/NNG 률/XSN 로/JKB 전환/NNG 할/XSV+ETM 경우/NNG 오히려/MAJ 경기/NNG 와/JKB 의/JKG 연관/NNG 성/XSN 이/JKS 좀/MAG 더/MAG 높/VA 은/ETM 것/NNB 으로/JKB 분석/NNG 되/XSV 므로/EC 경기/NNG 상환/NNG 판단/NNG 을/JKO 위해서/VV+EC 는/JX 공식/NNG 적/XSN 인/VCP+ETM 실업/NNG 률/XSN 보다/JKB 는/JX 광의/NNG 의/JKG 실업/NNG 률/XSN 을/JKO 이용/NNG 하/XSV 는/ETM 것/NNB 이/JKS 더/MAG 좋/VA 을/ETM 것/NNB 같/VA 다고/EC 설명/NNG 하/XSV 였/EP 음/ETN\n",
"['이/NP', '에/JKB', '대해/VV+', '관련/NNG;부서/NNG', '에서/JKB', '는/JX', '실업/NNG;률/XSN', '과/JC', '경기/NNG', '지수/NNG', '와/JKB', '의/JKG', '상관/NNG;관계/NNG', '는/JX', '그렇게/MAG', '높/VA', '지/EC;않/VX', '은/ETM', '것/NNB;으로/JKB', '나타나/VV', '며/EC', '통계청/NNG;에서/JKB', '발표/NNG', '하/XSV;는/ETM', '공식/NNG', '적/XSN;인/VCP', '실업/NNG;률/XSN', '을/JKO', '구직/NNG', '단념/NNG', '자/XSN', '를/JKO', '포함/NNG;한/XSA', '광의/NNG;의/JKG;실업/NNG', '률/XSN', '로/JKB;전환/NNG', '할/XSV;경우/NNG', '오히려/MAJ', '경기/NNG;와/JKB;의/JKG', '연관/NNG', '성/XSN;이/JKS', '좀/MAG;더/MAG', '높/VA;은/ETM', '것/NNB;으로/JKB', '분석/NNG', '되/XSV', '므로/EC', '경기/NNG', '상환/NNG', '판단/NNG', '을/JKO', '위해서/VV+;는/JX', '공식/NNG', '적/XSN;인/VCP', '실업/NNG;률/XSN', '보다/JKB;는/JX', '광의/NNG;의/JKG;실업/NNG', '률/XSN', '을/JKO', '이용/NNG', '하/XSV', '는/ETM;것/NNB', '이/JKS', '더/MAG', '좋/VA', '을/ETM;것/NNB', '같/VA', '다고/EC', '설명/NNG', '하/XSV', '였/EP;음/ETN']\n"
]
}
],
"source": [
"# Print one training sentence, then the same sentence after n-gram merging;\n",
"# in the recorded output merged parts are joined by \";\" and POS tags are kept.\n",
"sentence = ngram.sentences[18]\n",
"print(sentence)\n",
"\n",
"tokens = ngram.ngramize_sentence(\n",
"    sentence,\n",
"    surface_delim=\";\",\n",
"    strip_pos=False,\n",
")\n",
"print(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['이', '에', '대해', '관련부서', '에서', '는', '실업률', '과', '경기', '지수', '와', '의', '상관관계', '는', '그렇게', '높', '지않', '은', '것으로', '나타나', '며', '통계청에서', '발표', '하는', '공식', '적인', '실업률', '을', '구직', '단념', '자', '를', '포함한', '광의의실업', '률', '로전환', '할경우', '오히려', '경기와의', '연관', '성이', '좀더', '높은', '것으로', '분석', '되', '므로', '경기', '상환', '판단', '을', '위해서는', '공식', '적인', '실업률', '보다는', '광의의실업', '률', '을', '이용', '하', '는것', '이', '더', '좋', '을것', '같', '다고', '설명', '하', '였음']\n"
]
}
],
"source": [
"# Indexing the model with a sentence also merges n-grams; in the recorded output\n",
"# the POS tags are dropped and merged parts are concatenated without a delimiter.\n",
"# `sentence` comes from the previous cell.\n",
"tokens = ngram[sentence]\n",
"print(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:ekorpkit.models.ngram.ngram:found 25 ngrams\n"
]
},
{
"data": {
"text/plain": [
"{'동/MM;위원/NNG': {'score': 0.8568080788235676, 'count': 1},\n",
" '관련/NNG;하/XSV;여/EC': {'score': 0.6744817132834611, 'count': 1},\n",
" '고용/NNG;사정/NNG': {'score': 0.6440199999085726, 'count': 1},\n",
" '지/EC;않/VX': {'score': 0.9294521637628269, 'count': 2},\n",
" '우리/NP;나라/NNG': {'score': 0.8562398240331328, 'count': 2},\n",
" '실업/NNG;률/XSN': {'score': 0.6494435441593894, 'count': 3},\n",
" '에/JKB;비해/VV+': {'score': 0.5012251931663576, 'count': 2},\n",
" '지/EC;못하/VX': {'score': 0.7024120167866501, 'count': 1},\n",
" '구조/NNG;적/XSN;요인/NNG': {'score': 0.546126756666045, 'count': 3},\n",
" '적/XSN;인/VCP': {'score': 0.7342274264210907, 'count': 1},\n",
" '측면/NNG;에서/JKB': {'score': 0.5471802970887485, 'count': 2},\n",
" '해/XSV;보/VX;면/EC': {'score': 0.7278386924178137, 'count': 1},\n",
" '방향/NNG;으로/JKB;가/VV': {'score': 0.5214619840461885, 'count': 2},\n",
" '는/ETM;것/NNB': {'score': 0.5236409165544453, 'count': 1},\n",
" '다는/ETM;견해/NNG': {'score': 0.7309617813933906, 'count': 1},\n",
" '활동/NNG;참가/NNG': {'score': 0.7741200525059377, 'count': 1},\n",
" '기/ETN;때문/NNB': {'score': 0.8196906558080235, 'count': 1},\n",
" '같/VA;은/ETM': {'score': 0.7999751061740198, 'count': 1},\n",
" '과/JKB;함께/MAG': {'score': 0.6284362488292308, 'count': 1},\n",
" '좀/MAG;더/MAG': {'score': 0.8453728443412079, 'count': 1},\n",
" '심/VV;도/EC': {'score': 0.9942257464499541, 'count': 1},\n",
" '해/XSV;보/VX': {'score': 0.7018456967682065, 'count': 1},\n",
" '필요/NNG;가/JKS;있/VA': {'score': 0.8839755298206097, 'count': 1},\n",
" '의견/NNG;을/JKO;제시/NNG': {'score': 0.7695350089945394, 'count': 1},\n",
" '였/EP;음/ETN': {'score': 0.8196123192196287, 'count': 1}}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List the n-grams found in one sentence with their score and count.\n",
"# POS-tag rules are switched off here and the score threshold is 0.5.\n",
"sample_sentence = ngram.sentences[10]\n",
"_ngrams = ngram.find_ngrams(\n",
"    sample_sentence,\n",
"    threshold=0.5,\n",
"    apply_postag_rules=False,\n",
"    strip_pos=False,\n",
"    surface_delim=\";\",\n",
")\n",
"_ngrams"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"name": "preprocessor.ipynb",
"provenance": []
},
"interpreter": {
"hash": "f869af7787e6a1c49e09e367fc6e1b81d93d1c6583b43249c80edc047bd13cb2"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {
"2121ba5ed3ea4fc6bb6a8c19428365d5": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "ProgressStyleModel",
"state": {
"description_width": ""
}
},
"2388b4a7a5214c0691b12884bd00cbe3": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {}
},
"3de43a7c455a4b51844e33d2745e6c26": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"description_width": ""
}
},
"42d2b104d1ae4d9994c440585e1a985b": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {}
},
"6a07b8108fb14113a6f1077b89e26459": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HBoxModel",
"state": {
"children": [
"IPY_MODEL_b80c5cb2c5af44609cec75868db8aff2",
"IPY_MODEL_d40f958ef31147b7bdaa875143fd68ad",
"IPY_MODEL_c6432eff63b2411f8dbec9dc4490415c"
],
"layout": "IPY_MODEL_2388b4a7a5214c0691b12884bd00cbe3"
}
},
"759944222fb542108cef600664d77832": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {}
},
"816adaa566d249509b94f2b4255ce473": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {}
},
"a2c5155a7a7a4317bf7ec48c7c8f2819": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"description_width": ""
}
},
"b80c5cb2c5af44609cec75868db8aff2": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"layout": "IPY_MODEL_816adaa566d249509b94f2b4255ce473",
"style": "IPY_MODEL_a2c5155a7a7a4317bf7ec48c7c8f2819",
"value": "Tokenizing column: text: 100%"
}
},
"c6432eff63b2411f8dbec9dc4490415c": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"layout": "IPY_MODEL_759944222fb542108cef600664d77832",
"style": "IPY_MODEL_3de43a7c455a4b51844e33d2745e6c26",
"value": " 182/182 [00:54<00:00, 2.16it/s]"
}
},
"d40f958ef31147b7bdaa875143fd68ad": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "FloatProgressModel",
"state": {
"bar_style": "success",
"layout": "IPY_MODEL_42d2b104d1ae4d9994c440585e1a985b",
"max": 182,
"style": "IPY_MODEL_2121ba5ed3ea4fc6bb6a8c19428365d5",
"value": 182
}
}
},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}