Build and Load Corpora
from ekorpkit import eKonf
# Only call mount_google_drive() when is_colab() reports a Colab runtime.
if eKonf.is_colab():
    eKonf.mount_google_drive()

# Initialise the workspace; the returned object exposes `version` and
# `project_dir`, which are printed below.
ws = eKonf.init_workspace(
    workspace="/workspace",
    # NOTE(review): "exmaples" looks like a typo for "examples"; it is kept
    # unchanged because the recorded output uses this exact path.
    project="ekorpkit/exmaples",
    task="corpus",
    log_level="WARNING",
    verbose=True,
)
print("version:", ws.version)
print("project_dir:", ws.project_dir)
/home/yjlee/.cache/pypoetry/virtualenvs/ekorpkit-rYHZMHZH-py3.8/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
version: 0.1.41
project_dir: /workspace/projects/ekorpkit/exmaples
# Compose the `task/datasets=builder` config and display it. Values of the form
# '${...}' in the displayed result are interpolation expressions shown
# unresolved.
cfg = eKonf.compose('task/datasets=builder')
cfg
{'batch': {'batch_name': '${oc.select:..name,demo-batch}', 'batch_num': None, 'output_dir': '${oc.select:..path.output_dir,null}', 'output_suffix': None, 'output_extention': None, 'random_seed': True, 'seed': None, 'resume_run': False, 'resume_latest': False, 'run_to_resume': 'latest', 'verbose': '${oc.select:..project.verbose,false}', 'num_workers': '${oc.select:num_workers,1}', 'device': 'cpu', 'num_devices': None}, 'project': {'dotenv': {'dotenv_path': '${oc.select:..path.dotenv_path,.env}'}, 'joblib': {'distributed_framework': {'backend': 'joblib', 'initialize': True, 'num_workers': '${oc.select:num_workers,50}'}, 'batcher': {'procs': '${oc.select:..distributed_framework.num_workers,50}', 'minibatch_size': 1000, 'backend': '${..distributed_framework.backend}', 'task_num_cpus': 1, 'task_num_gpus': 0, 'verbose': 10}}, 'path': {'dotenv_path': '${.runtime}/.env', 'workspace': '${oc.select:..workspace_root,${.home}/.ekorpkit}', 'project': '${.workspace}/projects/${oc.select:..project_name,${oc.env:EKORPKIT_PROJECT_NAME,ekorpkit-default}}', 'data': '${alt:${oc.env:EKORPKIT_DATA_ROOT,null},${.workspace}/data}', 'home': '${__home_path__:}', 'hyfi': '${__hyfi_path__:}', 'ekorpkit': '${__ekorpkit_path__:}', 'resources': '${alt:${oc.env:EKORPKIT_RESOURCE_DIR,null},${.ekorpkit}/resources}', 'runtime': '${get_original_cwd:}', 'archive': '${.data}/archive', 'datasets': '${.data}/datasets', 'logs': '${.project}/logs', 'models': '${.data}/models', 'outputs': '${.project}/outputs', 'cache': '${.workspace}/.cache', 'tmp': '${.workspace}/.tmp', 'library': '${join_path:${.data}, libs}'}, 'project_name': '${oc.env:EKORPKIT_PROJECT_NAME,hyfi-project}', 'task_name': '${oc.env:EKORPKIT_PROJECT_NAME,datasets}', 'project_root': '${.path.project}', 'workspace_root': '${alt:${oc.env:EKORPKIT_WORKSPACE_ROOT,null},${.path.home}/.hyfi}', 'description': None, 'use_huggingface_hub': True, 'use_wandb': False, 'verbose': False}, 'path': {'task_name': '${oc.select:..project.task_name, 
default-task}', 'root': '${join_path:${oc.select:..project.project_root,${oc.env:EKORPKIT_PROJECT_ROOT,./}}, ${.task_name}}', 'verbose': '${oc.select:..batch.verbose, false}', 'output_dir': '${join_path:${.root}, outputs}', 'data_dir': '${join_path:${.root}, data}', 'library_dir': '${join_path:${.root}, libs}', 'model_dir': '${join_path:${.root}, models}', 'cache_dir': '${oc.select:..project.path.cache,${join_path:${.root}, cache}}', 'tmp_dir': '${oc.select:..project.path.tmp,${join_path:${.root}, tmp}}', 'log_dir': '${oc.select:..project.path.logs,${join_path:${.root}, logs}}', 'batch_name': '${oc.select:..batch.batch_name, demo}', 'batch_dir': '${.output_dir}/${.batch_name}'}, '_target_': 'ekorpkit.datasets.builder.DatasetBuilder', 'name': 'dataset-builder', 'auto': True, 'force': False, 'verbose': '${oc.select:..verbose, false}'}
from ekorpkit.datasets.builder import DatasetBuilder
# Construct the builder directly from the composed config's key/value pairs.
builder = DatasetBuilder(**cfg)
# NOTE(review): the recorded output is the string 'False', not the boolean
# False, although the composed config shows `'force': False` — confirm whether
# this string coercion is intended.
builder.force
'False'
# Print the builder's resolved configuration (path, project and batch settings).
# NOTE(review): in the recorded output the project paths resolve under
# /home/yjlee/.hyfi even though the workspace was initialised at /workspace —
# confirm this is expected.
print(builder)
config_name=None config_group=None name='dataset-builder' path=PathConfig(task_name='datasets', root='/home/yjlee/.hyfi/projects/hyfi-project/datasets', batch_name='dataset-builder', verbose=False) project=ProjectConfig(config_name='__init__', project_name='hyfi-project', task_name='datasets', workspace_root='/home/yjlee/.hyfi', project_root='/home/yjlee/.hyfi/projects/hyfi-project', description=None, use_huggingface_hub=True, use_wandb=False, verbose=False, dotenv=DotEnvConfig(HYFI_WORKSPACE_ROOT='/workspace', HYFI_PROJECT_NAME='ekorpkit/exmaples', HYFI_TASK_NAME='corpus', HYFI_PROJECT_ROOT='/workspace/projects/ekorpkit/exmaples', HYFI_DATA_ROOT='/workspace/data', HYFI_RESOURCE_DIR=None, HYFI_LOG_LEVEL='WARNING', HYFI_VERBOSE=True, NUM_WORKERS=None, CACHED_PATH_CACHE_ROOT='/workspace/.cache/cached_path', CUDA_DEVICE_ORDER=None, CUDA_VISIBLE_DEVICES=None, WANDB_PROJECT='ekorpkit-exmaples', WANDB_DISABLED=None, WANDB_DIR='/workspace/projects/ekorpkit/exmaples/logs', WANDB_NOTEBOOK_NAME='/workspace/projects/ekorpkit/exmaples/logs/corpus-nb', WANDB_SILENT=False, LABEL_STUDIO_SERVER=None, KMP_DUPLICATE_LIB_OK='True', TOKENIZERS_PARALLELISM=False, WANDB_API_KEY=None, HUGGING_FACE_HUB_TOKEN=None, ECOS_API_KEY=None, FRED_API_KEY=None, NASDAQ_API_KEY=None, HF_USER_ACCESS_TOKEN=None, LABEL_STUDIO_USER_TOKEN=None, dotenv_path='/workspace/projects/ekorpkit/book/tutorials/corpora/.env'), joblib=JobLibConfig(config_name='__init__', distributed_framework=DistFramwork(backend='joblib', initialize=True, num_workers=50), batcher=BatcherConfig(procs=50, minibatch_size=1000, backend='joblib', task_num_cpus=1, task_num_gpus=0, verbose=10)), path=PathConfig(config_name='__init__', dotenv_path='/workspace/projects/ekorpkit/book/tutorials/corpora/.env', workspace='/home/yjlee/.hyfi', project='/home/yjlee/.hyfi/projects/hyfi-project', data='/home/yjlee/.hyfi/data', home='/home/yjlee', hyfi='/home/yjlee/.cache/pypoetry/virtualenvs/ekorpkit-rYHZMHZH-py3.8/lib/python3.8/site-packages/hyfi', 
resources='/workspace/projects/ekorpkit/ekorpkit/resources', runtime='/workspace/projects/ekorpkit/book/tutorials/corpora', archive='/home/yjlee/.hyfi/data/archive', corpus=None, datasets='/home/yjlee/.hyfi/data/datasets', logs='/home/yjlee/.hyfi/projects/hyfi-project/logs', models='/home/yjlee/.hyfi/data/models', outputs='/home/yjlee/.hyfi/projects/hyfi-project/outputs', cache='/home/yjlee/.hyfi/.cache', tmp='/home/yjlee/.hyfi/.tmp', library='/home/yjlee/.hyfi/data/libs', verbose=False, ekorpkit='/workspace/projects/ekorpkit/ekorpkit')) module=None auto='True' force='False' autoload=False version='0.0.0' batch=BaseBatchConfig(batch_name='dataset-builder', batch_num=0, output_dir=PosixPath('/home/yjlee/.hyfi/projects/hyfi-project/datasets/outputs'), output_suffix=None, output_extention='', random_seed=True, seed=2974550991, resume_run=False, resume_latest=False, num_workers=1, device='cpu', num_devices=None, verbose=False, config_yaml='config.yaml', config_json='config.json', config_dirname='configs') filetype='.parquet' features={} _target_='ekorpkit.datasets.builder.DatasetBuilder' verbose=False root_dir=None batch_num=0
Build corpora with the ekorpkit configs
# Compose the `corpus/builtin=_dummy_bok_minutes` config and point it at a
# local data directory.
cfg = eKonf.compose('corpus/builtin=_dummy_bok_minutes')
cfg.data_dir = '../data/bok_minutes'
cfg.verbose = True
# Uncomment to inspect the composed config before instantiating.
# eKonf.print(cfg)
# NOTE(review): the recorded output (warnings about missing parquet files, a
# data preview and the corpus info) suggests that instantiating this config
# also runs the build — confirm against the builder's settings.
db = eKonf.instantiate(cfg)
WARNING:ekorpkit.io.file:No files found for bok_minutes-train.parquet
WARNING:ekorpkit.datasets.build:No datasets found
id filename mdate \
0 BOK_20181130_20181218_S1 BOK_20181130_20181218 2018-11-30 10:00:00
1 BOK_20181130_20181218_S2 BOK_20181130_20181218 2018-11-30 10:00:00
2 BOK_20181130_20181218_S3 BOK_20181130_20181218 2018-11-30 10:00:00
3 BOK_20181130_20181218_S4 BOK_20181130_20181218 2018-11-30 10:00:00
4 BOK_20181130_20181218_S5 BOK_20181130_20181218 2018-11-30 10:00:00
rdate section \
0 2018-12-18 16:00:00 Economic Situation
1 2018-12-18 16:00:00 Foreign Currency
2 2018-12-18 16:00:00 Financial Markets
3 2018-12-18 16:00:00 Monetary Policy
4 2018-12-18 16:00:00 Participants’ Views
text
0 일부 위원은 관련부서에서 지난 3/4분기 중 유로지역 경제성장 부진을 자동차 관련 ...
1 일부 위원은 그동안 글로벌펀드와 패시브펀드의 규모가 크게 확대되어 우리나라 자본유출...
2 일부 위원은 현재 대기업들이 전반적으로는 문제가 없지만, 건설 조선업 등에 속하는 ...
3 일부 위원은 최근 경기상황과 금융불균형 등을 고려할 때 확장적 재정정책의 필요성에는...
4 일부 위원은 최근 실물경제 성장경로의 하방위험이 다소 커진 것으로 보이고 물가도 상...
{'category': 'formal',
'column_info': {'_keys_': {'dataset': 'dataset',
'id': 'id',
'split': 'split',
'text': 'text',
'timestamp': 'timestamp'},
'columns': {'id': 'id',
'merge_meta_on': 'id',
'text': 'text',
'timestamp': None},
'data': {'id': 'int', 'text': 'str'},
'datetime': {'columns': None,
'format': None,
'rcParams': None},
'meta': {'filename': 'str',
'id': 'int',
'mdate': 'str',
'rdate': 'str'},
'segment_separator': '\\n\\n',
'sentence_separator': '\\n',
'timestamp': {'format': None, 'key': None, 'rcParams': None}},
'description': 'BOK Minutes Corpus',
'fullname': 'BOK MPB Minutes',
'homepage': 'https://www.bok.or.kr',
'info_updated': '2022-06-15 08:49:00',
'lang': 'ko',
'license': 'Bank of Korea',
'name': 'bok_minutes',
'version': '1.0.0'}
{'category': 'formal',
'column_info': {'_keys_': {'dataset': 'dataset',
'id': 'id',
'split': 'split',
'text': 'text',
'timestamp': 'timestamp'},
'columns': {'id': 'id',
'merge_meta_on': 'id',
'text': 'text',
'timestamp': None},
'data': {'id': 'int', 'text': 'str'},
'datetime': {'columns': None,
'format': None,
'rcParams': None},
'meta': {'filename': 'str',
'id': 'int',
'mdate': 'str',
'rdate': 'str'},
'segment_separator': '\\n\\n',
'sentence_separator': '\\n',
'timestamp': {'format': None, 'key': None, 'rcParams': None}},
'description': 'BOK Minutes Corpus',
'fullname': 'BOK MPB Minutes',
'homepage': 'https://www.bok.or.kr',
'info_updated': '2022-06-15 09:00:59',
'lang': 'ko',
'license': 'Bank of Korea',
'name': 'bok_minutes',
'version': '1.0.0'}
# Compose the `corpus/builtin=_dummy_fomc_minutes` config, point it at a local
# data directory, instantiate it and call build() explicitly.
cfg = eKonf.compose("corpus/builtin=_dummy_fomc_minutes")
cfg.data_dir = "../data/fomc_minutes"
db = eKonf.instantiate(cfg)
db.build()
Instantiating corpora
# Instantiate several corpora at once through the `corpus=corpora` config.
cfg = eKonf.compose('corpus=corpora')
# Names of the corpora to include.
cfg.name = ['bok_minutes', 'fomc_minutes']
# Parent of the per-corpus directories used in the build cells
# ('../data/bok_minutes', '../data/fomc_minutes').
cfg.data_dir = '../data'
# presumably makes instantiation load the data immediately — TODO confirm
cfg.auto.load = True
crps = eKonf.instantiate(cfg)
# Printing the object lists the names of the contained corpora.
print(crps)
Corpora
----------
bok_minutes
fomc_minutes
# Individual corpora are looked up by name; `.data` is displayed as a table
# with id, text and split columns.
crps['bok_minutes'].data
| index | id | text | split |
|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train |
# The FOMC corpus data additionally carries a content_type column.
crps['fomc_minutes'].data
| index | id | text | content_type | split |
|---|---|---|---|---|
| 0 | 0 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
| 1 | 1 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
| 2 | 2 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
| 3 | 3 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
| 4 | 4 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
# Concatenate the member corpora. In the recorded output `crps.data` then has a
# `corpus` column naming each row's source, and a column that a corpus lacks
# (content_type for bok_minutes) is NaN for its rows.
crps.concat_corpora()
crps.data
|   | id | text | split | corpus | content_type |
|---|---|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train | bok_minutes | NaN |
| 1 | 0 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
| 2 | 1 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
| 3 | 2 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
| 4 | 3 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
| 5 | 4 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
# Combined metadata of all corpora; in the recorded output, columns that a
# corpus does not provide are NaN for its rows.
crps.metadata
|   | id | mdate | rdate | filename | split | corpus | date | speaker | title |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 | train | bok_minutes | NaN | NaN | NaN |
| 1 | 0 | NaN | NaN | NaN | train | fomc_minutes | 1993-02-03 | Alan Greenspan | FOMC Meeting Minutes |
| 2 | 1 | NaN | NaN | NaN | train | fomc_minutes | 1993-03-23 | Alan Greenspan | FOMC Meeting Minutes |
| 3 | 2 | NaN | NaN | NaN | train | fomc_minutes | 1993-05-18 | Alan Greenspan | FOMC Meeting Minutes |
| 4 | 3 | NaN | NaN | NaN | train | fomc_minutes | 1993-07-07 | Alan Greenspan | FOMC Meeting Minutes |
| 5 | 4 | NaN | NaN | NaN | train | fomc_minutes | 1993-08-17 | Alan Greenspan | FOMC Meeting Minutes |
Instantiating a corpus
# Instantiate a single corpus by name from the generic `corpus` config.
cfg = eKonf.compose('corpus')
cfg.name = 'bok_minutes'
cfg.data_dir = '../data'
# Use `mdate` (a column of this corpus's metadata) as the timestamp key.
cfg.column_info.timestamp.key = 'mdate'
crps = eKonf.instantiate(cfg)
print(crps)
Corpus : bok_minutes
# Document table of the corpus (id, text, split).
crps.data
| index | id | text | split |
|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train |
# Metadata table of the corpus (id, mdate, rdate, filename, split).
crps.metadata
|   | id | mdate | rdate | filename | split |
|---|---|---|---|---|---|
| 0 | 0 | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 | train |
# Column-name attributes of the corpus, printed in order: ID, IDs, TEXT, DATA
# and METADATA (the latter two are lists of column names in the recorded
# output).
print(crps.ID, crps.IDs, crps.TEXT, crps.DATA, crps.METADATA)
id ['id', 'split'] text ['id', 'text', 'split'] ['id', 'mdate', 'rdate', 'filename', 'split']
# Merge the metadata into the data table; in the recorded output `crps.data`
# then also contains the mdate, rdate and filename columns.
crps.merge_metadata()
crps.data
|   | id | text | split | mdate | rdate | filename |
|---|---|---|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 |
# Set the timestamp key on the loaded corpus and load timestamps; in the
# recorded output `crps.data` gains a `timestamp` column whose value matches
# `mdate`.
# NOTE(review): the same key was already set on cfg before instantiation —
# confirm whether this second assignment is required.
crps.COLUMN.TIMESTAMP_INFO.key = 'mdate'
crps.load_timestamp()
crps.data
|   | id | text | split | mdate | rdate | filename | timestamp |
|---|---|---|---|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 | 2018-11-30 10:00:00 |
# Pretty-print the corpus info: column info, data/meta file names and
# document/segment/sentence/word statistics, overall and per split.
eKonf.pprint(crps.INFO)
{'category': 'formal',
'column_info': {'_keys_': {'dataset': 'dataset',
'id': 'id',
'split': 'split',
'text': 'text',
'timestamp': 'timestamp'},
'columns': {'id': 'id',
'merge_meta_on': 'id',
'text': 'text',
'timestamp': None},
'data': {'id': 'int', 'text': 'str'},
'datetime': {'columns': None,
'format': None,
'rcParams': None},
'meta': {'filename': 'str',
'id': 'int',
'mdate': 'str',
'rdate': 'str'},
'segment_separator': '\\n\\n',
'sentence_separator': '\\n',
'timestamp': {'format': None, 'key': None, 'rcParams': None}},
'data_files': {'train': 'bok_minutes-train.parquet'},
'data_files_modified': '2022-06-14 02:24:20',
'description': 'BOK Minutes Corpus',
'fullname': 'BOK MPB Minutes',
'homepage': 'https://www.bok.or.kr',
'info_updated': '2022-06-14 02:24:21',
'lang': 'ko',
'license': 'Bank of Korea',
'meta_files': {'train': 'meta-bok_minutes-train.parquet'},
'meta_files_modified': '2022-06-14 02:24:20',
'name': 'bok_minutes',
'num_bytes_before_processing': 88948,
'num_docs': 1,
'num_docs_before_processing': 1,
'num_segments': 5,
'num_sents': 346,
'num_words': 8171,
'size_in_bytes': 88925,
'size_in_human_bytes': '86.84 KiB',
'splits': {'train': {'data_file': 'bok_minutes-train.parquet',
'dataset_name': 'bok_minutes',
'human_bytes': '86.84 KiB',
'human_bytes_wospc': '78.86 KiB',
'meta_file': 'meta-bok_minutes-train.parquet',
'name': 'train',
'num_bytes': 88925,
'num_bytes_before_processing': 88948,
'num_bytes_max': 88925,
'num_bytes_median': 88925.0,
'num_bytes_min': 88925,
'num_bytes_wospc': 80751,
'num_docs': 1,
'num_docs_before_processing': 1,
'num_segments': 5,
'num_segments_median': 5.0,
'num_sents': 346,
'num_sents_median': 346.0,
'num_words': 8171,
'num_words_max': 8171,
'num_words_median': 8171.0,
'num_words_min': 8171}},
'version': '1.0.0'}