LM Dictionary vs. FinBERT vs. T5

Prepare the financial_phrasebank dataset

# Step 1: build the financial_phrasebank dataset under ../data/ using the
# 'dataset/simple' config group.
ds_name = "financial_phrasebank"
cfg = eKonf.compose('dataset/simple=' + ds_name)
cfg.data_dir = '../data/' + ds_name
# Both force flags are switched on; presumably this makes the builder redo the
# build and the summary even if earlier output exists — confirm against ekorpkit.
cfg.io.force.build = True
cfg.io.force.summarize = True
# NOTE(review): `db` is not referenced again in the visible part of this file.
db = eKonf.instantiate(cfg)

# Step 2: compose a separate plain 'dataset' config that points at a zipped
# copy of the dataset hosted on GitHub. `ds_cfg` is reused by the model and
# pipeline configs further down, so its name must not change.
ds_cfg = eKonf.compose('dataset')
ds_cfg.name = 'financial_phrasebank'
ds_cfg.path.cache.uri = 'https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/financial_phrasebank.zip'
# data_dir is read from path.cached_path *after* the cache URI is set;
# presumably cached_path is derived from that URI, so keep this order — TODO confirm.
ds_cfg.data_dir = ds_cfg.path.cached_path
ds_cfg.verbose = False
ds = eKonf.instantiate(ds_cfg)
Dataset : financial_phrasebank

Instantiating a sentiment analyser class with financial_phrasebank dataset

# Sentiment model config from the 'model/sentiment' group, option 'lm'
# (presumably the LM dictionary named in the page title — confirm).
model_cfg = eKonf.compose('model/sentiment=lm')

# NOTE: `cfg` is rebound here; it previously held the dataset-builder config.
cfg = eKonf.compose(config_group='pipeline')
cfg.verbose = False
cfg.data.dataset = ds_cfg
# Run only the 'predict' step of the pipeline.
cfg._pipeline_ = ['predict']
cfg.predict.model = model_cfg
cfg.predict.output_dir = "../data/predict"
# Evaluates to 'financial_phrasebank.parquet'.
cfg.predict.output_file = f'{ds_cfg.name}.parquet'
cfg.num_workers = 1
df = eKonf.instantiate(cfg)
# NOTE(review): the file loaded here carries a '-dev' suffix that is not part
# of output_file above; presumably the predict step writes one file per split
# (the evaluation config globs 'financial_phrasebank-*.parquet') — confirm.
tmp_df = eKonf.load_data("financial_phrasebank-dev.parquet", cfg.predict.output_dir)
# Classification evaluation config: compare the 'labels' column (actual)
# against the 'polarity_label' column (predicted) over three classes.
eval_cfg = eKonf.compose('model/eval=classification')
eval_cfg.columns.actual = 'labels'
eval_cfg.columns.predicted = 'polarity_label'
eval_cfg.labels = ['positive','neutral','negative']
# Read every prediction file matching the glob from the same directory the
# predict step was configured to write to ('../data/predict').
eval_cfg.data_dir = '../data/predict'
eval_cfg.data_file = 'financial_phrasebank-*.parquet'
eval_cfg.output_dir = '../data/eval'
# NOTE(review): no call that runs this config (e.g. an instantiate call) is
# visible here, although evaluation output follows in the page — confirm
# whether a line was lost in extraction.
Accuracy:  0.6421895861148198
Precison:  0.6333030880769485
Recall:  0.6421895861148198
F1 Score:  0.5974587783784485
Model Report: 
              precision    recall  f1-score   support

    negative       0.39      0.36      0.37       302
     neutral       0.68      0.89      0.77      1375
    positive       0.65      0.19      0.29       570

    accuracy                           0.64      2247
   macro avg       0.57      0.48      0.48      2247
weighted avg       0.63      0.64      0.60      2247

Instantiating a transformer classification model with financial_phrasebank dataset

# Training config for the transformer classification model.
# NOTE(review): `overrides` is not defined in the visible part of this file;
# it must be provided by an earlier cell — confirm what it contains.
model_cfg = eKonf.compose('model/transformer=classification', overrides)
model_cfg.dataset = ds_cfg
model_cfg.verbose = False
model_cfg.config.num_train_epochs = 2
model_cfg.config.max_seq_length = 256
model_cfg.config.train_batch_size = 32
model_cfg.config.eval_batch_size = 32
model_cfg.labels = ['positive','neutral','negative']
model_cfg._method_ = ['train']
# NOTE(review): `model_cfg` is rebound on the next statement without the
# 'train' config being used by any visible call — presumably an instantiate
# call belonged between the two configs; confirm.

# Evaluation config: identical settings to the one above except `_method_`.
model_cfg = eKonf.compose('model/transformer=classification', overrides)
model_cfg.dataset = ds_cfg
model_cfg.verbose = False
model_cfg.config.num_train_epochs = 2
model_cfg.config.max_seq_length = 256
model_cfg.config.train_batch_size = 32
model_cfg.config.eval_batch_size = 32
model_cfg.labels = ['positive','neutral','negative']
model_cfg._method_ = ['eval']
Accuracy:  0.9004424778761062
Precison:  0.9067742606459421
Recall:  0.9004424778761062
F1 Score:  0.90187372379709
Model Report: 
              precision    recall  f1-score   support

    negative       0.94      0.80      0.87        61
     neutral       0.96      0.93      0.94       277
    positive       0.77      0.88      0.82       114

    accuracy                           0.90       452
   macro avg       0.89      0.87      0.88       452
weighted avg       0.91      0.90      0.90       452
<ekorpkit.models.transformer.simple.SimpleClassification at 0x7f78034b6670>

Instantiating a T5 classification model with financial_phrasebank dataset

# T5 classification config; train and evaluate in a single run.
# NOTE(review): `overrides` is not defined in the visible part of this file;
# it must be provided by an earlier cell — confirm what it contains.
model_cfg = eKonf.compose('model/transformer=t5_classification_with_simple', overrides)
model_cfg.dataset = ds_cfg
model_cfg.verbose = False
model_cfg.config.num_train_epochs = 2
model_cfg.config.max_seq_length = 256
# Batch size is 8 for both training and evaluation here.
model_cfg.config.train_batch_size = 8
model_cfg.config.eval_batch_size = 8
model_cfg.labels = ['positive','neutral','negative']
model_cfg._method_ = ['train', 'eval']
# Alternative kept by the author: evaluate only, skipping training.
# model_cfg._method_ = ['eval']
{'eval_loss': 0.07960319360503681}
Accuracy:  0.9402654867256637
Precison:  0.9406832105947149
Recall:  0.9402654867256637
F1 Score:  0.9395622196129697
Model Report: 
              precision    recall  f1-score   support

    negative       0.96      0.82      0.88        61
     neutral       0.94      0.97      0.96       277
    positive       0.93      0.93      0.93       114

    accuracy                           0.94       452
   macro avg       0.94      0.91      0.92       452
weighted avg       0.94      0.94      0.94       452
<ekorpkit.models.transformer.simple_t5.SimpleT5 at 0x7f7802a775e0>