# Mecab Tokenizer

In [1]:
from ekorpkit import eKonf

# On Colab, mount Google Drive before the workspace is configured.
if eKonf.is_colab():
    eKonf.mount_google_drive()

# Workspace settings collected in one mapping so they are easy to adjust.
# NOTE(review): "exmaples" looks like a typo of "examples", but the paths in
# the recorded log output use the same spelling — confirm before renaming.
workspace_options = dict(
    workspace="/workspace",
    project="ekorpkit-book/exmaples",
    task="tutorials",
    log_level="INFO",
    verbose=True,
)
ws = eKonf.set_workspace(**workspace_options)

# Report the library version and the resolved project directory.
print("version:", ws.version)
print("project_dir:", ws.project_dir)

INFO:ekorpkit.base:Set environment variable EKORPKIT_DATA_ROOT=/workspace/data
INFO:ekorpkit.base:Set environment variable CACHED_PATH_CACHE_ROOT=/workspace/.cache/cached_path
INFO:ekorpkit.base:Set environment variable WANDB_DIR=/workspace/projects/ekorpkit-book/exmaples/logs
INFO:ekorpkit.base:Set environment variable WANDB_PROJECT=ekorpkit-book-exmaples
INFO:ekorpkit.base:Set environment variable WANDB_NOTEBOOK_NAME=/workspace/projects/ekorpkit-book/exmaples/logs/tutorials-nb
INFO:ekorpkit.base:Set environment variable WANDB_SILENT=False


version: 0.1.40.post0.dev88
project_dir: /workspace/projects/ekorpkit-book/exmaples
time: 967 ms (started: 2023-02-06 06:37:34 +00:00)


## Instantiating the Mecab tokenizer

In [2]:
# Select the plain mecab tokenizer config group, compose its config,
# and build the tokenizer object from that config.
config_group = "preprocessor/tokenizer=mecab"
cfg = eKonf.compose(
    config_group=config_group,
)
mecab = eKonf.instantiate(cfg)

INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': None, 'backend': 'mecab-python3', 'verbose': False}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...


time: 756 ms (started: 2023-02-06 06:37:36 +00:00)


In [3]:
# Sample sentence mixing Hangul, Hanja and Latin characters.
text = "IMF가 推定한 우리나라의 GDP갭률은 今年에도 소폭의 마이너스(−)를 持續하고 있다."

# Tokenize via the explicit method ...
tokens = mecab.tokenize(text)
print(tokens)

# ... and via calling the tokenizer object directly; the recorded output
# of both calls is identical.
print(mecab(text))

['IMF/SL', '가/JKS', '/SP', '推定/NNG', '한/XSA+ETM', '/SP', '우리나라/NNG', '의/JKG', '/SP', 'GDP/SL', '갭/NNG', '률/XSN', '은/JX', '/SP', '今年/NNG', '에/JKB', '도/JX', '/SP', '소폭/NNG', '의/JKG', '/SP', '마이너스/NNG', '(/SSO', '−)/SY', '를/JKO', '/SP', '持續/NNG', '하/XSV', '고/EC', '/SP', '있/VX', '다/EF', './SF']
['IMF/SL', '가/JKS', '/SP', '推定/NNG', '한/XSA+ETM', '/SP', '우리나라/NNG', '의/JKG', '/SP', 'GDP/SL', '갭/NNG', '률/XSN', '은/JX', '/SP', '今年/NNG', '에/JKB', '도/JX', '/SP', '소폭/NNG', '의/JKG', '/SP', '마이너스/NNG', '(/SSO', '−)/SY', '를/JKO', '/SP', '持續/NNG', '하/XSV', '고/EC', '/SP', '있/VX', '다/EF', './SF']
time: 4.36 ms (started: 2023-02-06 06:37:37 +00:00)


In [4]:
# Compose the `formal_ko` normalizer config and build the normalizer.
# The composed config is kept in `cfg_norm` because a later cell attaches
# it to the tokenizer config.
config_group = "preprocessor/normalizer=formal_ko"
cfg_norm = eKonf.compose(
    config_group=config_group,
)
norm = eKonf.instantiate(cfg_norm)

time: 799 ms (started: 2023-02-06 06:37:37 +00:00)


In [5]:
# Run the normalizer on the sample sentence. In the recorded output the
# Hanja words appear in Hangul and the "−" sign appears as "-".
normalized_text = norm(text)
normalized_text

'IMF가 추정한 우리나라의 GDP갭률은 금년에도 소폭의 마이너스(-)를 지속하고 있다.'

time: 3.48 ms (started: 2023-02-06 06:37:38 +00:00)


In [6]:
# Variant 1: rebuild the mecab tokenizer, handing it the already
# instantiated normalizer through the `normalize` keyword override.
config_group = "preprocessor/tokenizer=mecab"
cfg = eKonf.compose(
    config_group=config_group,
)
mecab = eKonf.instantiate(
    cfg,
    normalize=norm,
)

# The recorded tokens now carry the normalized (Hangul) word forms.
tokens = mecab.tokenize(text)
print(tokens)

INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': None, 'backend': 'mecab-python3', 'verbose': False}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...


['IMF/SL', '가/JKS', '/SP', '추정/NNG', '한/XSA+ETM', '/SP', '우리나라/NNG', '의/JKG', '/SP', 'GDP/SL', '갭/NNG', '률/XSN', '은/JX', '/SP', '금년/NNG', '에/JKB', '도/JX', '/SP', '소폭/NNG', '의/JKG', '/SP', '마이너스/NNG', '(/SSO', '-)/SY', '를/JKO', '/SP', '지속/NNG', '하/XSV', '고/EC', '/SP', '있/VX', '다/EF', './SF']
time: 329 ms (started: 2023-02-06 06:37:38 +00:00)


In [7]:
# Variant 2: attach the normalizer *config* to the tokenizer config before
# instantiation, instead of passing a normalizer instance as an override.
config_group = "preprocessor/tokenizer=mecab"
cfg = eKonf.compose(
    config_group=config_group,
)
cfg.normalize = cfg_norm
mecab = eKonf.instantiate(cfg)

# The recorded output matches the keyword-override variant.
tokens = mecab.tokenize(text)
print(tokens)

INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': None, 'backend': 'mecab-python3', 'verbose': False}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...


['IMF/SL', '가/JKS', '/SP', '추정/NNG', '한/XSA+ETM', '/SP', '우리나라/NNG', '의/JKG', '/SP', 'GDP/SL', '갭/NNG', '률/XSN', '은/JX', '/SP', '금년/NNG', '에/JKB', '도/JX', '/SP', '소폭/NNG', '의/JKG', '/SP', '마이너스/NNG', '(/SSO', '-)/SY', '를/JKO', '/SP', '지속/NNG', '하/XSV', '고/EC', '/SP', '있/VX', '다/EF', './SF']
time: 338 ms (started: 2023-02-06 06:37:38 +00:00)


In [8]:
# Extract the nouns of the sample sentence; the recorded output lists the
# surface forms only, without POS tags.
noun_tokens = mecab.nouns(text)
print(noun_tokens)

['IMF', '추정', '우리나라', 'GDP', '갭', '률', '금년', '소폭', '마이너스', '지속']
time: 3.18 ms (started: 2023-02-06 06:37:39 +00:00)


In [9]:
# Split the sample sentence into morphemes; the recorded output lists the
# surface forms only, without POS tags.
morph_tokens = mecab.morphs(text)
print(morph_tokens)

['IMF', '가', '추정', '한', '우리나라', '의', 'GDP', '갭', '률', '은', '금년', '에', '도', '소폭', '의', '마이너스', '를', '지속', '하', '고', '있', '다']
time: 3.91 ms (started: 2023-02-06 06:37:39 +00:00)


In [10]:
# Second sample sentence containing central-bank terminology.
# In the recorded output the current tokenizer splits "금통위" into
# three separate tokens ("금", "통", "위").
text = "금통위는 통화신용정책과 한국은행의 운영에 관한 의결권을 행사한다."
print(
    mecab.tokenize(text)
)

['금/MAJ', '통/MAG', '위/NNG', '는/JX', '/SP', '통화/NNG', '신용/NNG', '정책/NNG', '과/JC', '/SP', '한국은행/NNP', '의/JKG', '/SP', '운영/NNG', '에/JKB', '/SP', '관한/VV+ETM', '/SP', '의결/NNG', '권/XSN', '을/JKO', '/SP', '행사/NNG', '한다/XSV+EF', './SF']
time: 3.07 ms (started: 2023-02-06 06:37:39 +00:00)


In [20]:
# Switch to the `mecab_econ` tokenizer config. The recorded log shows it is
# initialized with a user dictionary (`ekon_v1.dic`), and the recorded
# output keeps "금통위" and "의결권" as single NNP tokens.
# NOTE(review): this cell's execution count (20) is out of sequence with the
# preceding cells — re-run the notebook top to bottom before publishing.
config_group = "preprocessor/tokenizer=mecab_econ"
cfg = eKonf.compose(
    config_group=config_group,
)
mecab = eKonf.instantiate(cfg)

tokens = mecab.tokenize(text)
print(tokens)

INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': '/workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic', 'backend': 'mecab-python3', 'verbose': False}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...


['금통위/NNP', '는/JX', '/SP', '통화/NNG', '신용/NNG', '정책/NNG', '과/JC', '/SP', '한국은행/NNP', '의/JKG', '/SP', '운영/NNG', '에/JKB', '/SP', '관한/VV+ETM', '/SP', '의결권/NNP', '을/JKO', '/SP', '행사/NNG', '한다/XSV+EF', './SF']
time: 273 ms (started: 2023-02-06 06:49:13 +00:00)
