Build and Load Datasets#
from ekorpkit import eKonf
if eKonf.is_colab():
    eKonf.mount_google_drive()
ws = eKonf.init_workspace(
    workspace="/workspace",
    project="ekorpkit-book/examples",
    task="esg",
    log_level="WARNING",
    verbose=True,
)
print("version:", ws.version)
print("project_dir:", ws.project_dir)
version: 0.1.41
project_dir: /workspace/projects/ekorpkit-book/examples
Build a dataset with the ekorpkit configs#
from datasets import load_dataset_builder
ds_builder = load_dataset_builder("rotten_tomatoes")
ds_builder.info.description
ds_builder.info._to_yaml_dict()
{'features': [{'name': 'text', 'dtype': 'string'},
{'name': 'label',
'dtype': {'class_label': {'names': {'0': 'neg', '1': 'pos'}}}}],
'config_name': 'default',
'splits': [{'name': 'train', 'num_bytes': 1074810, 'num_examples': 8530},
{'name': 'validation', 'num_bytes': 134679, 'num_examples': 1066},
{'name': 'test', 'num_bytes': 135972, 'num_examples': 1066}],
'download_size': 487770,
'dataset_size': 1345461}
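The builder only fetches metadata; to actually materialize a split you can load it with the standard Hugging Face datasets API. A minimal sketch, independent of ekorpkit:
from datasets import get_dataset_split_names, load_dataset

# list the available splits without downloading the data
print(get_dataset_split_names("rotten_tomatoes"))  # ['train', 'validation', 'test']

# load a single split and look at one example
rt_train = load_dataset("rotten_tomatoes", split="train")
print(rt_train[0])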
ds_name = "sst2"
cfg = eKonf.compose("dataset/simple=" + ds_name)
cfg.data_dir = "../data/" + ds_name
cfg.io.data_dir = cfg.data_dir
cfg.io.overwrite = True
cfg.io.calculate_stats = True
db = eKonf.instantiate(cfg)
WARNING:datasets.builder:Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})
WARNING:datasets.builder:Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 1821
})
WARNING:datasets.builder:Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 872
})
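Before building another dataset it can help to dump the composed config and see which keys (data_dir, io.overwrite, io.calculate_stats, ...) are available to override. A minimal sketch, assuming eKonf.print accepts a composed config the same way it accepts the info dicts shown further below:
# dump the composed build config before instantiating it
cfg = eKonf.compose("dataset/simple=" + ds_name)
eKonf.print(cfg)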
ds_name = "nsmc"
cfg = eKonf.compose("dataset/simple=" + ds_name)
cfg.data_dir = "../data/" + ds_name
cfg.io.data_dir = cfg.data_dir
cfg.io.overwrite = True
cfg.io.calculate_stats = True
db = eKonf.instantiate(cfg)
WARNING:datasets.builder:Using custom data configuration default
WARNING:datasets.builder:Reusing dataset nsmc (/root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)
Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 150000
})
WARNING:datasets.builder:Using custom data configuration default
WARNING:datasets.builder:Reusing dataset nsmc (/root/.cache/huggingface/datasets/nsmc/default/1.1.0/bfd4729bf1a67114e5267e6916b9e4807010aeb238e4a3c2b95fbfa3a014b5f3)
Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 50000
})
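The build writes each split to the configured data_dir as parquet files, so the result can be sanity-checked with pandas. The file name below is an assumption following the <name>-<split>.parquet pattern seen later on this page; adjust it to whatever the build actually produced:
import pandas as pd

# hypothetical path: assumes the nsmc build wrote nsmc-train.parquet under ../data/nsmc
nsmc_train = pd.read_parquet("../data/nsmc/nsmc-train.parquet")
print(nsmc_train.shape)
nsmc_train.head()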
Instantiating datasets#
cfg = eKonf.compose('dataset=datasets')
cfg.datasets = ['nsmc', 'sst2']
cfg.data_dir = '../data'
cfg.verbose = False
ds = eKonf.instantiate(cfg)
print(ds)
Datasets
----------
nsmc
sst2
print(ds.COLUMN)
DatasetInfo :
{'_keys_': {'id': 'id', 'text': 'text', 'split': 'split', 'dataset': 'dataset'}, 'columns': {'id': 'id', 'text': 'text'}, 'datetime': {'columns': None, 'format': None, 'rcParams': None}, 'data': {'id': 'int', 'text': 'str'}}
# ds.concat_datasets()  # optional: persist() will concatenate the member datasets automatically if needed
ds.persist()
WARNING:ekorpkit.datasets.datasets:datasets not concatenated yet, calling concatenate()
print(ds.COLUMN)
DatasetInfo :
{'_keys_': {'id': 'id', 'text': 'text', 'split': 'split', 'dataset': 'dataset'}, 'columns': {'id': 'id', 'text': 'text'}, 'datetime': {'columns': None, 'format': None, 'rcParams': None}, 'data': {'subset': 'object', 'labels': 'object', 'text': 'object', 'id': 'int64', 'split': 'object'}}
print(ds.INFO)
{'features': {'_keys_': {'id': 'id', 'text': 'text', 'split': 'split', 'dataset': 'dataset'}, 'columns': {'id': 'id', 'text': 'text'}, 'datetime': {'columns': None, 'format': None, 'rcParams': None}, 'data': {'id': 'int', 'text': 'str'}, '_target_': 'ekorpkit.info.column.DatasetInfo'}, 'path': {'root': '/workspace/data/None', 'name': None, 'cached_path': None, 'filetype': '.parquet', 'verbose': False, 'data_dir': '../data', 'data_file': None, 'concat_data': False, 'data_columns': None, 'columns': None, 'output_dir': None, 'output_file': None, 'suffix': None, 'cache': {'uri': None, 'extract_archive': True, 'force_extract': False, 'return_parent_dir': True, 'cache_dir': '/workspace/.cache', 'verbose': False, 'path': None}}, 'auto': {'load': True, 'build': False}, 'force': {'rebuild': False}, 'info': {'stats': {'_func_': {'len_bytes': {'_partial_': True, '_target_': 'ekorpkit.utils.func.len_bytes'}}, '_target_': 'ekorpkit.info.stat.summary_stats', '_partial_': True, 'num_workers': 1, 'key_columns': None, 'num_columns': {'num_bytes': 'len_bytes'}, 'agg_funcs': {'num_bytes': ['count', 'sum', 'median', 'max', 'min']}, 'rename_columns': {'num_bytes_count': 'num_examples', 'num_bytes_sum': 'num_bytes'}, 'convert_to_humanbytes': {'num_bytes': 'human_bytes'}, 'text_keys': 'text'}, '_target_': 'ekorpkit.info.stat.SummaryInfo', 'name': None, 'data_dir': '../data', 'info_file': 'info-None.yaml', 'info_list': ['name', 'fullname', 'domain', 'task', 'lang', 'description', 'license', 'homepage', 'version', 'num_examples', 'size_in_bytes', 'size_in_human_bytes', 'data_files_modified', 'info_updated', 'data_files', 'features'], 'update_files_info': {'data_files': 'data_file', 'meta_files': 'meta_file'}, 'update_info': ['fullname', 'lang', 'domain', 'task', 'description', 'license', 'homepage', 'version'], 'modified_info': {'data_files_modified': 'data_file'}, 'key_columns': None, 'verbose': False, 'aggregate_info': {'num_examples': 'num_examples', 'size_in_bytes': 'num_bytes'}}, 'name': 'nsmc-sst2', 'data_dir': '../data', 'filetype': '.parquet', 'use_name_as_subdir': True, 'verbose': False, 'datasets': {'nsmc': <ekorpkit.datasets.dataset.Dataset object at 0x7f6ea7d8ac10>, 'sst2': <ekorpkit.datasets.dataset.Dataset object at 0x7f6ea7d493a0>}}
print(f"Name of a new dataset: {ds.name}")
Name of a new dataset: nsmc-sst2
Instantiating a dataset#
cfg = eKonf.compose('dataset')
cfg.name = 'nsmc-sst2'
cfg.data_dir = '../data'
cfg.verbose = False
ds = eKonf.instantiate(cfg)
print(ds)
Dataset : nsmc-sst2
print(ds.COLUMN)
DatasetInfo :
{'_keys_': {'id': 'id', 'text': 'text', 'split': 'split', 'dataset': 'dataset'}, 'columns': {'id': ['id', 'split'], 'text': 'text'}, 'datetime': {'columns': None, 'format': None, 'rcParams': None}, 'data': {'id': 'int64', 'split': 'object', 'labels': 'object', 'subset': 'object', 'text': 'object', '_id_': 'int64', 'dataset': 'object'}}
ds.splits['train'].dtypes
id int64
split object
labels object
subset object
text object
_id_ int64
dataset object
dtype: object
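Each split is a plain pandas DataFrame, so ordinary pandas operations apply. An illustrative snippet (not part of the ekorpkit API), using the labels and dataset columns from the dtypes above:
# label counts in the combined train split, grouped by source dataset
train = ds.splits["train"]
print(train.groupby("dataset")["labels"].value_counts())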
eKonf.print(ds.INFO)
{'features': {'_keys_': {'dataset': 'dataset',
'id': 'id',
'split': 'split',
'text': 'text'},
'columns': {'id': 'id', 'text': 'text'},
'data': {'id': 'int64',
'labels': 'object',
'split': 'object',
'subset': 'object',
'text': 'object'},
'datetime': {'columns': None,
'format': None,
'rcParams': None}},
'data_files': {'test': 'nsmc-sst2-test.parquet',
'train': 'nsmc-sst2-train.parquet'},
'data_files_modified': '2022-06-13 10:01:36',
'info_updated': '2022-06-14 03:10:52',
'meta_files': {},
'name': 'nsmc-sst2',
'num_examples': 269170,
'size_in_bytes': 21104544,
'size_in_human_bytes': '20.13 MiB',
'splits': {'test': {'data_file': 'nsmc-sst2-test.parquet',
'human_bytes': '4.32 MiB',
'name': 'test',
'num_bytes': 4528486,
'num_bytes_max': 420,
'num_bytes_median': 66.0,
'num_bytes_min': 0,
'num_examples': 51821},
'train': {'data_file': 'nsmc-sst2-train.parquet',
'human_bytes': '15.81 MiB',
'name': 'train',
'num_bytes': 16576058,
'num_bytes_max': 420,
'num_bytes_median': 57.0,
'num_bytes_min': 0,
'num_examples': 217349}}}
ds.splits
{'train': id split labels subset \
index
0 0 train negative None
1 1 train positive None
2 2 train negative None
3 3 train negative None
4 4 train positive None
... ... ... ... ...
217344 217344 train positive sst2
217345 217345 train negative sst2
217346 217346 train positive sst2
217347 217347 train positive sst2
217348 217348 train negative sst2
text _id_ dataset
index
0 아 더빙.. 진짜 짜증나네요 목소리 0 nsmc
1 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나 1 nsmc
2 너무재밓었다그래서보는것을추천한다 2 nsmc
3 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정 3 nsmc
4 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ... 4 nsmc
... ... ... ...
217344 a delightful comedy 67344 sst2
217345 anguish , anger and frustration 67345 sst2
217346 at achieving the modest , crowd-pleasing goals... 67346 sst2
217347 a patient viewer 67347 sst2
217348 this new jangle of noise , mayhem and stupidit... 67348 sst2
[217349 rows x 7 columns],
'test': id split labels subset \
0 0 test positive None
1 1 test negative None
2 2 test negative None
3 3 test negative None
4 4 test negative None
... ... ... ... ...
51816 51816 test None sst2
51817 51817 test None sst2
51818 51818 test None sst2
51819 51819 test None sst2
51820 51820 test None sst2
text _id_ dataset
0 굳 ㅋ 0 nsmc
1 GDNTOPCLASSINTHECLUB 1 nsmc
2 뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아 2 nsmc
3 지루하지는 않은데 완전 막장임... 돈주고 보기에는.... 3 nsmc
4 3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠?? 4 nsmc
... ... ... ...
51816 it risks seeming slow and pretentious , becaus... 1816 sst2
51817 take care of my cat offers a refreshingly diff... 1817 sst2
51818 davis has filled out his cast with appealing f... 1818 sst2
51819 it represents better-than-average movie-making... 1819 sst2
51820 dazzling and sugar-sweet , a blast of shallow ... 1820 sst2
[51821 rows x 7 columns]}
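Because the combined splits keep the source name in the dataset column, one source can be recovered with ordinary boolean indexing. An illustrative snippet:
# pull the nsmc rows back out of the combined train split
train = ds.splits["train"]
nsmc_only = train[train["dataset"] == "nsmc"]
print(len(nsmc_only))  # expected to match the original nsmc train size (150,000)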