Evaluate pretrained embeddings#

%config InlineBackend.figure_format='retina'
import warnings
import logging
from ekorpkit import eKonf

# Emit INFO-level log records so library progress messages show up in the cell output.
logging.basicConfig(level=logging.INFO)
# Suppress all Python warnings process-wide to keep the notebook output readable.
warnings.filterwarnings('ignore')
# Record the installed ekorpkit version next to the results.
print(eKonf.__version__)
0.1.32+1.gcf6615f.dirty
# Directory used for the evaluation tables saved and loaded in this notebook.
data_dir = "../data/embeddings"
# Pre-bind the directory on the save helper so later calls only pass the
# dataframe and an output file name.
save_dataframe = eKonf.partial(
    config_group="_func_/save_dataframe", output_dir=data_dir
)
# Same idea for loading: later calls only pass the data file name.
load_dataframe = eKonf.partial(config_group="_func_/load_dataframe", data_dir=data_dir)

Load GloVe Vectors#

Wikipedia#

# Start from the "model/embedding" config group and point it at the GloVe 6B archive.
cfg = eKonf.compose(config_group="model/embedding")
cfg.name = "glove_wiki"
# Corpus label for this model (presumably what ends up in the summary's
# "Corpus" column — confirm).
cfg.corpus = "Wikipedia"
cfg.cache.uri = "https://nlp.stanford.edu/data/glove.6B.zip"
# NOTE(review): assumes cache.path is derived from cache.uri, so it is read
# only after the URI has been set — confirm. The load log shows the vectors
# file being read from the extracted cache directory.
cfg.model_dir = cfg.cache.path
# 300-dimensional vectors from the 6B archive.
cfg.model_file = "glove.6B.300d.txt"
cfg.model_type = "glove"
# Build the embedding model object from the config, then load the vectors.
wv_wiki = eKonf.instantiate(cfg)
wv_wiki.load()
INFO:cached_path:cache of https://nlp.stanford.edu/data/glove.6B.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/d97dcf99e9ed61e3b07a5a87dd5dea7b6a4815ba6eb1ac87f529bfe6e2ba4ccf.f9662ccb99e715467e6da0c85a047cfc51888321958f9440eb5a15f7189c140f-extracted
INFO:cached_path:cache of https://nlp.stanford.edu/data/glove.6B.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/d97dcf99e9ed61e3b07a5a87dd5dea7b6a4815ba6eb1ac87f529bfe6e2ba4ccf.f9662ccb99e715467e6da0c85a047cfc51888321958f9440eb5a15f7189c140f-extracted
INFO:cached_path:cache of https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/gensim/test/test_data/questions-words.txt is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /root/.ekorpkit/.cache/cached_path/7568e1428a2ba53a33f67dea7e276f212c065b07d102aafc177a507d244161f6.f60f7f82aaacf3a534d93013617bcfead46a795268096ff1990608b16115b566
INFO:gensim.models.keyedvectors:loading projection weights from /workspace/.cache/cached_path/d97dcf99e9ed61e3b07a5a87dd5dea7b6a4815ba6eb1ac87f529bfe6e2ba4ccf.f9662ccb99e715467e6da0c85a047cfc51888321958f9440eb5a15f7189c140f-extracted/glove.6B.300d.txt
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /workspace/.cache/cached_path/d97dcf99e9ed61e3b07a5a87dd5dea7b6a4815ba6eb1ac87f529bfe6e2ba4ccf.f9662ccb99e715467e6da0c85a047cfc51888321958f9440eb5a15f7189c140f-extracted/glove.6B.300d.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-06-08T10:14:23.606212', 'gensim': '4.2.0', 'python': '3.8.12 (default, Jan 14 2022, 01:33:56) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.0-58-generic-x86_64-with-glibc2.27', 'event': 'load_word2vec_format'}
# Score the Wikipedia vectors on the "google" analogy set. Per the recorded
# output, the result is a mapping with keys 'score', 'summary', 'correct'
# and 'incorrect'.
wiki_results = wv_wiki.evaluate_word_analogies(analogies="google")
# Last expression of the cell: displays the available result keys.
wiki_results.keys()
INFO:gensim.models.keyedvectors:Evaluating word analogies for top 300000 words in the model on /root/.ekorpkit/.cache/cached_path/7568e1428a2ba53a33f67dea7e276f212c065b07d102aafc177a507d244161f6.f60f7f82aaacf3a534d93013617bcfead46a795268096ff1990608b16115b566
INFO:gensim.models.keyedvectors:capital-common-countries: 94.9% (480/506)
INFO:gensim.models.keyedvectors:capital-world: 96.0% (4342/4524)
INFO:gensim.models.keyedvectors:currency: 17.1% (138/808)
INFO:gensim.models.keyedvectors:city-in-state: 59.3% (1463/2467)
INFO:gensim.models.keyedvectors:family: 88.1% (446/506)
INFO:gensim.models.keyedvectors:gram1-adjective-to-adverb: 22.6% (224/992)
INFO:gensim.models.keyedvectors:gram2-opposite: 27.3% (222/812)
INFO:gensim.models.keyedvectors:gram3-comparative: 88.1% (1174/1332)
INFO:gensim.models.keyedvectors:gram4-superlative: 72.2% (810/1122)
INFO:gensim.models.keyedvectors:gram5-present-participle: 70.0% (739/1056)
INFO:gensim.models.keyedvectors:gram6-nationality-adjective: 92.6% (1480/1599)
INFO:gensim.models.keyedvectors:gram7-past-tense: 61.2% (954/1560)
INFO:gensim.models.keyedvectors:gram8-plural: 78.1% (1040/1332)
INFO:gensim.models.keyedvectors:gram9-plural-verbs: 58.5% (509/870)
INFO:gensim.models.keyedvectors:Quadruplets with out-of-vocabulary words: 0.3%
INFO:gensim.models.keyedvectors:NB: analogies containing OOV words were skipped from evaluation! To change this behavior, use "dummy4unknown=True"
INFO:gensim.models.keyedvectors:Total accuracy: 72.0% (14021/19486)
INFO:ekorpkit.models.embeddings.wordvec:Evaluation score: 0.7195422354510931
dict_keys(['score', 'summary', 'correct', 'incorrect'])

Twitter Data#

# Start from the "model/embedding" config group and point it at the GloVe Twitter 27B archive.
cfg = eKonf.compose(config_group="model/embedding")
cfg.name = "glove_twitter"
# Corpus label for this model (presumably what ends up in the summary's
# "Corpus" column — confirm).
cfg.corpus = "Twitter"
cfg.cache.uri = "https://nlp.stanford.edu/data/glove.twitter.27B.zip"
# NOTE(review): assumes cache.path is derived from cache.uri, so it is read
# only after the URI has been set — confirm. The load log shows the vectors
# file being read from the extracted cache directory.
cfg.model_dir = cfg.cache.path
# 200-dimensional vectors from the Twitter archive.
cfg.model_file = "glove.twitter.27B.200d.txt"
cfg.model_type = "glove"
# Build the embedding model object from the config, then load the vectors.
wv_twt = eKonf.instantiate(cfg)
wv_twt.load()
INFO:cached_path:cache of https://nlp.stanford.edu/data/glove.twitter.27B.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/d08f2221566196fe3fd6f557a2b67e9ffd5cf4b7c500918d98581a52f349a804.7417464dae8ba25e69e639021d320765d2d18bb24f73391ef423756069dc8078-extracted
INFO:cached_path:cache of https://nlp.stanford.edu/data/glove.twitter.27B.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/d08f2221566196fe3fd6f557a2b67e9ffd5cf4b7c500918d98581a52f349a804.7417464dae8ba25e69e639021d320765d2d18bb24f73391ef423756069dc8078-extracted
INFO:cached_path:cache of https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/gensim/test/test_data/questions-words.txt is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /root/.ekorpkit/.cache/cached_path/7568e1428a2ba53a33f67dea7e276f212c065b07d102aafc177a507d244161f6.f60f7f82aaacf3a534d93013617bcfead46a795268096ff1990608b16115b566
INFO:gensim.models.keyedvectors:loading projection weights from /workspace/.cache/cached_path/d08f2221566196fe3fd6f557a2b67e9ffd5cf4b7c500918d98581a52f349a804.7417464dae8ba25e69e639021d320765d2d18bb24f73391ef423756069dc8078-extracted/glove.twitter.27B.200d.txt
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (1193514, 200) matrix of type float32 from /workspace/.cache/cached_path/d08f2221566196fe3fd6f557a2b67e9ffd5cf4b7c500918d98581a52f349a804.7417464dae8ba25e69e639021d320765d2d18bb24f73391ef423756069dc8078-extracted/glove.twitter.27B.200d.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-06-08T09:15:08.452346', 'gensim': '4.2.0', 'python': '3.8.12 (default, Jan 14 2022, 01:33:56) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.0-58-generic-x86_64-with-glibc2.27', 'event': 'load_word2vec_format'}
# Score the Twitter vectors on the "google" analogy set. Per the recorded
# output, the result is a mapping with keys 'score', 'summary', 'correct'
# and 'incorrect'.
twt_results = wv_twt.evaluate_word_analogies(analogies="google")
# Last expression of the cell: displays the available result keys.
twt_results.keys()
INFO:gensim.models.keyedvectors:Evaluating word analogies for top 300000 words in the model on /root/.ekorpkit/.cache/cached_path/7568e1428a2ba53a33f67dea7e276f212c065b07d102aafc177a507d244161f6.f60f7f82aaacf3a534d93013617bcfead46a795268096ff1990608b16115b566
INFO:gensim.models.keyedvectors:capital-common-countries: 70.6% (357/506)
INFO:gensim.models.keyedvectors:capital-world: 74.6% (1639/2198)
INFO:gensim.models.keyedvectors:currency: 3.3% (15/458)
INFO:gensim.models.keyedvectors:city-in-state: 35.6% (879/2467)
INFO:gensim.models.keyedvectors:family: 79.5% (302/380)
INFO:gensim.models.keyedvectors:gram1-adjective-to-adverb: 12.7% (118/930)
INFO:gensim.models.keyedvectors:gram2-opposite: 30.8% (185/600)
INFO:gensim.models.keyedvectors:gram3-comparative: 74.2% (989/1332)
INFO:gensim.models.keyedvectors:gram4-superlative: 64.0% (718/1122)
INFO:gensim.models.keyedvectors:gram5-present-participle: 66.4% (701/1056)
INFO:gensim.models.keyedvectors:gram6-nationality-adjective: 72.7% (894/1229)
INFO:gensim.models.keyedvectors:gram7-past-tense: 53.1% (787/1482)
INFO:gensim.models.keyedvectors:gram8-plural: 76.0% (1012/1332)
INFO:gensim.models.keyedvectors:gram9-plural-verbs: 52.0% (452/870)
INFO:gensim.models.keyedvectors:Quadruplets with out-of-vocabulary words: 18.3%
INFO:gensim.models.keyedvectors:NB: analogies containing OOV words were skipped from evaluation! To change this behavior, use "dummy4unknown=True"
INFO:gensim.models.keyedvectors:Total accuracy: 56.7% (9048/15962)
INFO:ekorpkit.models.embeddings.wordvec:Evaluation score: 0.5668462598671845
dict_keys(['score', 'summary', 'correct', 'incorrect'])

Common Crawl#

# Start from the "model/embedding" config group and point it at the GloVe 42B Common Crawl archive.
cfg = eKonf.compose(config_group="model/embedding")
cfg.name = "glove_commoncrawl"
# Corpus label for this model (presumably what ends up in the summary's
# "Corpus" column — confirm).
cfg.corpus = "Common Crawl"
cfg.cache.uri = "https://nlp.stanford.edu/data/glove.42B.300d.zip"
# NOTE(review): assumes cache.path is derived from cache.uri, so it is read
# only after the URI has been set — confirm. The load log shows the vectors
# file being read from the extracted cache directory.
cfg.model_dir = cfg.cache.path
# 300-dimensional vectors from the 42B archive.
cfg.model_file = "glove.42B.300d.txt"
cfg.model_type = "glove"
# Build the embedding model object from the config, then load the vectors.
wv_cc = eKonf.instantiate(cfg)
wv_cc.load()
INFO:cached_path:cache of https://nlp.stanford.edu/data/glove.42B.300d.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/1b77e307d0976680276bd5eddffac55a83164787611dccb81aaaab9c4c79073b.d4304a004cdd3b0267ad688b5fc4d6c9d43c622a571ee2ba8b39604fc257faed-extracted
INFO:cached_path:cache of https://nlp.stanford.edu/data/glove.42B.300d.zip is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/1b77e307d0976680276bd5eddffac55a83164787611dccb81aaaab9c4c79073b.d4304a004cdd3b0267ad688b5fc4d6c9d43c622a571ee2ba8b39604fc257faed-extracted
INFO:cached_path:cache of https://raw.githubusercontent.com/RaRe-Technologies/gensim/develop/gensim/test/test_data/questions-words.txt is up-to-date
INFO:ekorpkit.io.cached_path:cached path: /root/.ekorpkit/.cache/cached_path/7568e1428a2ba53a33f67dea7e276f212c065b07d102aafc177a507d244161f6.f60f7f82aaacf3a534d93013617bcfead46a795268096ff1990608b16115b566
INFO:gensim.models.keyedvectors:loading projection weights from /workspace/.cache/cached_path/1b77e307d0976680276bd5eddffac55a83164787611dccb81aaaab9c4c79073b.d4304a004cdd3b0267ad688b5fc4d6c9d43c622a571ee2ba8b39604fc257faed-extracted/glove.42B.300d.txt
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (1917494, 300) matrix of type float32 from /workspace/.cache/cached_path/1b77e307d0976680276bd5eddffac55a83164787611dccb81aaaab9c4c79073b.d4304a004cdd3b0267ad688b5fc4d6c9d43c622a571ee2ba8b39604fc257faed-extracted/glove.42B.300d.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-06-08T09:22:21.176032', 'gensim': '4.2.0', 'python': '3.8.12 (default, Jan 14 2022, 01:33:56) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.0-58-generic-x86_64-with-glibc2.27', 'event': 'load_word2vec_format'}
# Score the Common Crawl vectors on the "google" analogy set. Per the recorded
# output, the result is a mapping with keys 'score', 'summary', 'correct'
# and 'incorrect'.
cc_results = wv_cc.evaluate_word_analogies(analogies="google")
# Last expression of the cell: displays the available result keys.
cc_results.keys()
INFO:gensim.models.keyedvectors:Evaluating word analogies for top 300000 words in the model on /root/.ekorpkit/.cache/cached_path/7568e1428a2ba53a33f67dea7e276f212c065b07d102aafc177a507d244161f6.f60f7f82aaacf3a534d93013617bcfead46a795268096ff1990608b16115b566
INFO:gensim.models.keyedvectors:capital-common-countries: 95.1% (481/506)
INFO:gensim.models.keyedvectors:capital-world: 94.0% (4178/4446)
INFO:gensim.models.keyedvectors:currency: 17.6% (142/808)
INFO:gensim.models.keyedvectors:city-in-state: 78.1% (1926/2467)
INFO:gensim.models.keyedvectors:family: 90.9% (460/506)
INFO:gensim.models.keyedvectors:gram1-adjective-to-adverb: 30.2% (300/992)
INFO:gensim.models.keyedvectors:gram2-opposite: 35.6% (289/812)
INFO:gensim.models.keyedvectors:gram3-comparative: 85.6% (1140/1332)
INFO:gensim.models.keyedvectors:gram4-superlative: 84.0% (942/1122)
INFO:gensim.models.keyedvectors:gram5-present-participle: 80.9% (854/1056)
INFO:gensim.models.keyedvectors:gram6-nationality-adjective: 88.3% (1412/1599)
INFO:gensim.models.keyedvectors:gram7-past-tense: 49.3% (769/1560)
INFO:gensim.models.keyedvectors:gram8-plural: 84.9% (1131/1332)
INFO:gensim.models.keyedvectors:gram9-plural-verbs: 64.1% (558/870)
INFO:gensim.models.keyedvectors:Quadruplets with out-of-vocabulary words: 0.7%
INFO:gensim.models.keyedvectors:NB: analogies containing OOV words were skipped from evaluation! To change this behavior, use "dummy4unknown=True"
INFO:gensim.models.keyedvectors:Total accuracy: 75.1% (14582/19408)
INFO:ekorpkit.models.embeddings.wordvec:Evaluation score: 0.7513396537510305
dict_keys(['score', 'summary', 'correct', 'incorrect'])

Evaluate embeddings#

import pandas as pd

# Stack the per-corpus summary tables (Wikipedia, Twitter, Common Crawl, in
# that order) into a single frame and persist it as parquet.
summary_frames = [
    run["summary"] for run in (wiki_results, twt_results, cc_results)
]
eval_summary = pd.concat(summary_frames)
save_dataframe(eval_summary, output_file="eval_summary.parquet")
INFO:ekorpkit.io.file:Saving dataframe as ../data/embeddings/eval_summary.parquet
# Load eval_summary.parquet through the helper bound to data_dir and preview
# the first rows.
eval_summary = load_dataframe(data_file="eval_summary.parquet")
eval_summary.head()
INFO:ekorpkit.io.file:Loading data from ../data/embeddings/eval_summary.parquet
Category Correct Incorrect Samples Average Corpus
0 Capitals 480 26 506 0.948617 Wikipedia
1 Capitals RoW 4342 182 4524 0.959770 Wikipedia
2 Currency 138 670 808 0.170792 Wikipedia
3 City-State 1463 1004 2467 0.593028 Wikipedia
4 Famliy 446 60 506 0.881423 Wikipedia
# Chart title carrying the overall analogy score of each GloVe source.
accuracy_title = f"Word Vector Accuracy by Glove Source: Twitter: {twt_results['score']:.2%}, Wiki: {wiki_results['score']:.2%}, Crawl: {cc_results['score']:.2%}"

cfg = eKonf.compose(config_group="visualize/plot=barplot")

# Figure-level settings.
cfg.figure.figsize = (16, 5)
cfg.figure.fontsize = 10

# Data mapping for the first plot: category on x, accuracy on y, one hue per corpus.
cfg.plots[0].x = "Category"
cfg.plots[0].y = "Average"
cfg.plots[0].hue = "Corpus"

# Axis settings: y ticks are formatted as whole percentages.
cfg.ax.ytickmajorformatterfunc = "lambda y, _: '{:.0%}'.format(y)"
cfg.ax.ylabel = "Accuracy"
cfg.ax.title = accuracy_title

# Render the bar plot from the combined evaluation summary.
eKonf.instantiate(cfg, data=eval_summary)
INFO:ekorpkit.visualize.plot:Plotting barplot with {'x': 'Category', 'y': 'Average', 'hue': 'Corpus'}
INFO:ekorpkit.visualize.plot:Saved figure to ./figs/plot_BarPlot.png
../../../_images/d18ca3d98c465059f54f7573de8207b2384a384eaa9ad39af778225036eb12e1.png

Visualize Embeddings#

# Reduce a 100,000-word subset of the Wikipedia vectors to two dimensions and
# pull out the projected vectors together with the word-to-index lookup.
results = wv_wiki.reduce_embeddings_2d(restrict_vocab=100_000)
vectors, word2idx = results["vectors"], results["word2idx"]
INFO:ekorpkit.models.embeddings.wordvec:dimensions: (100000, 300)
INFO:ekorpkit.models.embeddings.wordvec:explained variance: [0.02604632 0.01293811]
# From the correctly answered Wikipedia analogies, select the most similar
# ones. Per the recorded output this yields one quadruplet per category with
# its similarity value.
correct_analogies = wiki_results["correct"]
best_analogies = wv_wiki.find_most_similar_analogies(
    correct_analogies, word2idx, vectors
)
best_analogies  # last expression of the cell: displays the table
wordpairs word2idx similarity
category
Adj-Adverb (fortunate, fortunately, lucky, luckily) (11156, 11584, 5065, 19955) 1.998511
Capitals (london, england, paris, france) (516, 563, 1035, 387) 1.780721
Capitals RoW (vienna, austria, brussels, belgium) (4094, 2640, 3879, 2975) 1.996072
City-State (chicago, illinois, omaha, nebraska) (1147, 2884, 12159, 6087) 1.971419
Comparative (long, longer, heavy, heavier) (173, 1078, 1106, 11613) 1.932268
Currency (usa, dollar, russia, ruble) (2396, 678, 412, 17506) 1.834880
Famliy (sons, daughters, stepfather, stepmother) (2912, 4321, 20624, 26903) 1.999805
Nationality (switzerland, swiss, israel, israeli) (2311, 1849, 315, 406) 1.999926
Opposite (tasteful, distasteful, likely, unlikely) (43255, 41259, 647, 2993) 1.954353
Past Tense (sitting, sat, hitting, hit) (2995, 3223, 3141, 416) 1.999999
Plural (eye, eyes, donkey, donkeys) (2090, 2251, 20328, 35193) 1.999876
Plural Verbs (talk, talks, estimate, estimates) (1077, 370, 3470, 2886) 1.739114
Pres. Part. (write, writing, read, reading) (2432, 1649, 1465, 2185) 1.999959
Superlative (weak, weakest, big, biggest) (2690, 15655, 365, 882) 0.663122
Total accuracy (sitting, sat, hitting, hit) (2995, 3223, 3141, 416) 1.999999

Plot Analogy Examples#

# Plot the selected analogy quadruplets using the 2-D projected vectors
# (ncols=3 presumably lays the panels out three per row — confirm).
# NOTE(review): the recorded run logs "No data to plot" / "No plots to plot"
# before saving the figure — confirm the rendered image is as intended.
wv_wiki.plot_similar_analogies(best_analogies, vectors, ncols=3, figsize=(15, 15))
INFO:ekorpkit.visualize.plot:No data to plot
INFO:ekorpkit.visualize.plot:No plots to plot
INFO:ekorpkit.visualize.plot:Saved figure to ./figs/plot_plot.png
../../../_images/78fd5bd8e6a2299238d54bc41ee4d279c42f4823fd8058d688a59a8fea9452f7.png