# Corpus task pipelines

import logging
from ekorpkit import eKonf

# Raise the root logger threshold to WARNING so that lower-level messages
# (INFO/DEBUG) do not clutter the cell outputs below.
logging.basicConfig(level=logging.WARNING)
# Show the version string exposed by eKonf, to make the run reproducible.
print(eKonf.__version__)
0.1.32+2.g8216333.dirty

## Apply a pipeline to a corpus

# --- Input: a single corpus ("bok_minutes") located under ../data ---------
corpus_cfg = eKonf.compose(config_group="corpus")
corpus_cfg.name = "bok_minutes"
corpus_cfg.data_dir = "../data"
corpus_cfg.automerge = True
corpus_cfg.verbose = False

# --- Pipeline: attach the fully configured corpus as the input data -------
cfg = eKonf.compose(config_group="pipeline")
cfg.data.corpus = corpus_cfg
cfg.verbose = False

# Steps named in `_pipeline_` (presumably applied in the listed order):
#   1. filter_query   - keep only rows matching the query expression
#   2. save_dataframe - store the result under output_dir / output_file
cfg._pipeline_ = ["filter_query", "save_dataframe"]
cfg.filter_query.query = "filename in ['BOK_20181130_20181218']"
cfg.save_dataframe.output_file = "corpus_filtered.parquet"
cfg.save_dataframe.output_dir = "../data/bok_minutes"

# Instantiate the composed config and preview the last rows of the result.
data = eKonf.instantiate(cfg)
data.tail()
id text mdate rdate filename
0 0 Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... 2018-11-30 10:00:00 2018-12-18 16:00:00 BOK_20181130_20181218

## Apply a pipeline to corpora

# --- Input: several corpora at once, selected via the "corpora" option ----
corpus_cfg = eKonf.compose(config_group="corpus=corpora")
corpus_cfg.name = ["bok_minutes", "fomc_minutes"]
corpus_cfg.data_dir = "../data"
corpus_cfg.automerge = True
corpus_cfg.verbose = False

# --- Pipeline: attach the fully configured corpora as the input data ------
cfg = eKonf.compose(config_group="pipeline")
cfg.data.corpus = corpus_cfg
cfg.verbose = False

# Steps named in `_pipeline_` (presumably applied in the listed order):
#   1. filter_query   - keep only rows whose id equals 0
#   2. save_dataframe - store the result under output_dir / output_file
cfg._pipeline_ = ["filter_query", "save_dataframe"]
cfg.filter_query.query = "id == 0"
cfg.save_dataframe.output_file = "corpora_filtered.parquet"
cfg.save_dataframe.output_dir = "../data/tmp"

# Instantiate the composed config and preview the last rows of the result.
data = eKonf.instantiate(cfg)
data.tail()
id text corpus
0 0 Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... bok_minutes
1 0 A meeting of the Federal Open Market Committee... fomc_minutes