# Instantiating a pipeline

# Set up logging and report the ekorpkit runtime environment.
import logging
from ekorpkit import eKonf

# INFO level so that ekorpkit's own log records appear in the cell output.
logging.basicConfig(level=logging.INFO)
print("version:", eKonf.__version__)
print("is notebook?", eKonf.is_notebook())
print("is colab?", eKonf.is_colab())
# Label for the dump below (spelling fixed: was "evironment varialbles:").
print("environment variables:")
eKonf.print(eKonf.env().dict())
INFO:ekorpkit.base:IPython version: (6, 9, 0), client: jupyter_client
INFO:ekorpkit.base:Google Colab not detected.
version: 0.1.33+21.gf33ff55.dirty
is notebook? True
is colab? False
evironment varialbles:
{'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_PROJECT': 'ekorpkit-book',
 'EKORPKIT_WORKSPACE_ROOT': '/workspace',
 'NUM_WORKERS': 230}

## Load data

# Compose the "data" config, point its cache at the remote EDGAR archive, and
# load edgar.parquet from the locally extracted copy of that archive.
cfg = eKonf.compose("data")
cfg.path.cache.uri = "https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip"
# The value read from cached_path is the local "...-extracted" directory shown
# in the "cached path:" log lines of this cell's output.
# NOTE(review): presumably cached_path is derived from cache.uri, so the URI
# must be set before this line — confirm before reordering.
cfg.data_dir = cfg.path.cached_path
# The parquet file lives in the archive's "edgar" sub-directory.
cfg.data_dir += "/edgar"
cfg.data_file = "edgar.parquet"
data = eKonf.instantiate(cfg)
print(data)
INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted
INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted
INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted
INFO:ekorpkit.io.file:Processing [1] files from ['edgar.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet
Data: edgar.parquet<class 'pandas.core.frame.DataFrame'>
# Compose the "blank" pipeline config and instantiate it on the `data` object
# created by the data-loading cell.
df_cfg = eKonf.compose("pipeline=blank")
df_cfg.name = "edgar_sample"
# No pipes are configured, so the run only logs "No pipeline specified".
df = eKonf.instantiate(df_cfg, data=data)
df.head()
WARNING:ekorpkit.pipelines.pipe:No pipeline specified
id filename item text cik company filing_type filing_date period_of_report sic state_of_inc state_location fiscal_year_end filing_html_index htm_filing_link complete_text_filing_link
1410 1534 1999/320193_10K_1999_0000912057-99-010244.json item_1 ITEM 1. \nBUSINESS GENERAL Apple Computer, Inc... 320193 APPLE COMPUTER INC 10-K 1999-12-22 1999-09-25 3571 CA CA 0930 https://www.sec.gov/Archives/edgar/data/320193... None https://www.sec.gov/Archives/edgar/data/320193...
1560 1697 1999/21344_10K_1999_0000021344-00-000009.json item_1 ITEM 1. \nBUSINESS The Coca-Cola Company (toge... 21344 COCA COLA CO 10-K 2000-03-09 1999-12-31 2080 DE GA 1231 https://www.sec.gov/Archives/edgar/data/21344/... None https://www.sec.gov/Archives/edgar/data/21344/...
2746 2977 1999/70858_10K_1999_0000950168-00-000621.json item_1 Item 1. \nBUSINESS General Bank of America Cor... 70858 BANK OF AMERICA CORP /DE/ 10-K 2000-03-20 1999-12-31 6021 DE NC 1231 https://www.sec.gov/Archives/edgar/data/70858/... None https://www.sec.gov/Archives/edgar/data/70858/...
3762 4088 1999/80424_10K_1999_0000080424-99-000027.json item_1 Item 1. \nBusiness. \n--------- General Develo... 80424 PROCTER & GAMBLE CO 10-K 1999-09-15 1999-06-30 2840 OH OH 0630 https://www.sec.gov/Archives/edgar/data/80424/... None https://www.sec.gov/Archives/edgar/data/80424/...
4806 5211 1999/1018724_10K_1999_0000891020-00-000622.json item_1 ITEM 1. \nBUSINESS This Annual Report on Form ... 1018724 AMAZON COM INC 10-K 2000-03-29 1999-12-31 5961 DE WA 1231 https://www.sec.gov/Archives/edgar/data/101872... None https://www.sec.gov/Archives/edgar/data/101872...
# Same "blank" pipeline, but configured to locate and load the parquet file
# itself instead of receiving an already-loaded `data` object.
df_cfg = eKonf.compose("pipeline=blank")
df_cfg.name = "edgar_sample"
df_cfg.path.cache.uri = "https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip"
# NOTE(review): presumably cached_path is derived from cache.uri, so the URI
# must be set before this line — confirm before reordering.
df_cfg.data_dir = df_cfg.path.cached_path
# The parquet file lives in the archive's "edgar" sub-directory.
df_cfg.data_dir += "/edgar"
df_cfg.data_file = "edgar.parquet"
# NOTE(review): the recorded output of this cell still lists every column,
# not only these six — confirm that data_columns is honoured by this config.
df_cfg.data_columns = ["id", "filename", "item", "cik", "company", "text"]
df = eKonf.instantiate(df_cfg)
df.head()
INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted
INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted
INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted
INFO:ekorpkit.io.file:Processing [1] files from ['edgar.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet
WARNING:ekorpkit.pipelines.pipe:No pipeline specified
id filename item text cik company filing_type filing_date period_of_report sic state_of_inc state_location fiscal_year_end filing_html_index htm_filing_link complete_text_filing_link
1410 1534 1999/320193_10K_1999_0000912057-99-010244.json item_1 ITEM 1. \nBUSINESS GENERAL Apple Computer, Inc... 320193 APPLE COMPUTER INC 10-K 1999-12-22 1999-09-25 3571 CA CA 0930 https://www.sec.gov/Archives/edgar/data/320193... None https://www.sec.gov/Archives/edgar/data/320193...
1560 1697 1999/21344_10K_1999_0000021344-00-000009.json item_1 ITEM 1. \nBUSINESS The Coca-Cola Company (toge... 21344 COCA COLA CO 10-K 2000-03-09 1999-12-31 2080 DE GA 1231 https://www.sec.gov/Archives/edgar/data/21344/... None https://www.sec.gov/Archives/edgar/data/21344/...
2746 2977 1999/70858_10K_1999_0000950168-00-000621.json item_1 Item 1. \nBUSINESS General Bank of America Cor... 70858 BANK OF AMERICA CORP /DE/ 10-K 2000-03-20 1999-12-31 6021 DE NC 1231 https://www.sec.gov/Archives/edgar/data/70858/... None https://www.sec.gov/Archives/edgar/data/70858/...
3762 4088 1999/80424_10K_1999_0000080424-99-000027.json item_1 Item 1. \nBusiness. \n--------- General Develo... 80424 PROCTER & GAMBLE CO 10-K 1999-09-15 1999-06-30 2840 OH OH 0630 https://www.sec.gov/Archives/edgar/data/80424/... None https://www.sec.gov/Archives/edgar/data/80424/...
4806 5211 1999/1018724_10K_1999_0000891020-00-000622.json item_1 ITEM 1. \nBUSINESS This Annual Report on Form ... 1018724 AMAZON COM INC 10-K 2000-03-29 1999-12-31 5961 DE WA 1231 https://www.sec.gov/Archives/edgar/data/101872... None https://www.sec.gov/Archives/edgar/data/101872...

## Process a pipeline with the ekorpkit configs

# Compose the default corpus config, name it "bok_minutes", point it at
# ../data, and print the resulting config.
corpus_cfg = eKonf.compose("corpus")
corpus_cfg.name = "bok_minutes"
# The printed config shows this value under both data_dir and path.data_dir.
corpus_cfg.data_dir = "../data"
eKonf.print(corpus_cfg)
{'_target_': 'ekorpkit.datasets.corpus.Corpus',
 'auto': {'load': True, 'merge': False},
 'features': {'_target_': 'ekorpkit.info.column.CorpusInfo',
                 'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': None,
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'data_dir': '../data',
 'filetype': None,
 'force': {'build': False},
 'metadata_dir': None,
 'name': 'bok_minutes',
 'path': {'cache': {'cache_dir': '/workspace/.cache',
                    'extract_archive': True,
                    'force_extract': False,
                    'path': None,
                    'return_parent_dir': True,
                    'uri': None,
                    'verbose': False},
          'cached_path': None,
          'columns': None,
          'concat_data': False,
          'data_columns': None,
          'data_dir': '../data',
          'data_file': None,
          'filetype': None,
          'name': 'bok_minutes',
          'output_dir': None,
          'output_file': None,
          'root': '/workspace/data/bok_minutes',
          'suffix': None,
          'verbose': False},
 'use_name_as_subdir': True,
 'verbose': False}
# Build a pipeline over the corpus config that runs a single "summary_stats"
# pipe and saves its result to ../data/output/bok_minutes_stats.yaml.
cfg = eKonf.compose("pipeline")
cfg.data.corpus = corpus_cfg
# _pipeline_ lists the pipes to apply; only summary_stats is requested here.
cfg._pipeline_ = ["summary_stats"]
# Destination of the stats file (logged as "Saving summary stats: ...").
cfg.summary_stats.output_dir = "../data/output"
cfg.summary_stats.output_file = "bok_minutes_stats.yaml"
df = eKonf.instantiate(cfg)
INFO:ekorpkit.datasets.base:Loaded info file: ../data/bok_minutes/info-bok_minutes.yaml
INFO:ekorpkit.io.file:Processing [1] files from ['bok_minutes-train.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/bok_minutes/bok_minutes-train.parquet']
INFO:ekorpkit.io.file:Loading data from ../data/bok_minutes/bok_minutes-train.parquet
INFO:ekorpkit.info.column:index: index, index of data: None, columns: ['id', 'text'], id: ['id']
INFO:ekorpkit.info.column:Adding id [split] to ['id']
INFO:ekorpkit.info.column:Added id [split], now ['id', 'split']
INFO:ekorpkit.info.column:Added a column [split] with value [train]
INFO:ekorpkit.io.file:Processing [1] files from ['meta-bok_minutes-train.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/bok_minutes/meta-bok_minutes-train.parquet']
INFO:ekorpkit.io.file:Loading data from ../data/bok_minutes/meta-bok_minutes-train.parquet
INFO:ekorpkit.info.column:Added a column [split] with value [train]
INFO:ekorpkit.info.column:No timestamp key found
INFO:ekorpkit.io.file:Concatenating 1 dataframes
INFO:ekorpkit.pipelines.pipe:Applying pipeline: OrderedDict([('summary_stats', 'summary_stats')])
INFO:ekorpkit.base:Applying pipe: functools.partial(<function summary_stats at 0x7f7f8a58bf70>)
INFO:ekorpkit.base:Using batcher with minibatch size: 1
INFO:ekorpkit.utils.batch.batcher: backend: joblib  minibatch_size: 1  procs: 230  input_split: False  merge_output: True  len(data): 1 len(args): 5
apply len_bytes to num_bytes: 100%|██████████| 1/1 [00:08<00:00,  8.51s/it]
INFO:ekorpkit.pipelines.pipe:Saving summary stats: ../data/output/bok_minutes_stats.yaml
import os

# Rebuild the path that the summary_stats pipe was configured to write to,
# then load the saved file back so the notebook displays its contents.
stats_cfg = cfg.summary_stats
filepath = os.path.join(stats_cfg.output_dir, stats_cfg.output_file)
eKonf.load(filepath)
{'num_examples': 1, 'num_bytes': 88934, 'num_bytes_median': 88934.0, 'num_bytes_max': 88934, 'num_bytes_min': 88934, 'human_bytes': '86.85 KiB'}

## Melt

## Pivot

# Load the FOMC sentiment table from ../data/fomc.
data_dir = "../data/fomc"
fomc_sentiments = eKonf.load_data("fomc_sentiments.parquet", data_dir)

# Configure the "pipeline/pivot" pipe and apply it to the loaded table.
# NOTE(review): index/columns/values presumably mirror the arguments of
# pandas.DataFrame.pivot — confirm against the pipe's implementation.
cfg = eKonf.compose("pipeline/pivot")
cfg.index = "recent_meeting"
cfg.columns = "content_type"
cfg.values = ["polarity_mean", "polarity_diffusion", "num_examples"]
data = eKonf.pipe(fomc_sentiments, cfg)
data.tail()