Source code for source.task_eval

# coding: utf-8
import sys, os
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from scipy.stats import spearmanr
from nltk.corpus import wordnet as wn
import spacy
from tqdm import tqdm
import json
import argh
from argh import arg
import math
import random
from typing import List, Tuple

import matplotlib
matplotlib.rcParams["savefig.dpi"] = 300
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
matplotlib.style.use('fivethirtyeight')

from itertools import combinations, product, chain
from tabulate import tabulate, LATEX_ESCAPE_RULES
from copy import deepcopy
from collections import defaultdict
import warnings

from source.process_embeddings import mid_fusion, MM_TOKEN, Embeddings
import source.utils as utils
from source.utils import get_vec, pfont, PrintFont, LaTeXFont, latex_table_post_process, latex_table_wrapper, \
                         dict2struct_array, tuple_list

sys.path.append('../2v2_software_privatefork/')
import two_vs_two

MISSING = -2  # Signify word pairs which aren't covered by and embedding's vocabulary
ROUND = 2  # Round scores in print
NAME_DELIM = ' | '
linewidth = 3


# We could use dataclass decorator in case of python 3.7
[docs]class DataSets:
    """Class for storing evaluation datasets and linguistic embeddings."""
    # Evaluation datasets
    men: List[Tuple[str, str, float]]
    simlex: List[Tuple[str, str, float]]
    # simverb: List[Tuple[str, str, float]]
    fmri_vocab = ['airplane', 'ant', 'apartment', 'arch', 'arm', 'barn', 'bear', 'bed', 'bee', 'beetle',
                  'bell', 'bicycle', 'bottle', 'butterfly', 'car', 'carrot', 'cat', 'celery', 'chair',
                  'chimney', 'chisel', 'church', 'closet', 'coat', 'corn', 'cow', 'cup', 'desk', 'dog',
                  'door', 'dress', 'dresser', 'eye', 'fly', 'foot', 'glass', 'hammer', 'hand', 'horse',
                  'house', 'igloo', 'key', 'knife', 'leg', 'lettuce', 'pants', 'pliers', 'refrigerator',
                  'saw', 'screwdriver', 'shirt', 'skirt', 'spoon', 'table', 'telephone', 'tomato', 'train',
                  'truck', 'watch', 'window']
    datasets = {}
    normalizers = {}

    def __init__(self, datadir: str):
        # SIMVERB = datadir + '/simverb-3500-data'
        # simverb_full = list(csv.reader(open(SIMVERB + '/SimVerb-3500.txt'), delimiter='\t'))
        # self.simverb = list(map(lambda x: [x[0], x[1], x[3]], simverb_full))
        self.men = json.load(open(datadir + '/men.json'))
        self.simlex = json.load(open(datadir + '/simlex.json'))
        self.datasets = {'MEN': self.men, 'SimLex': self.simlex}  # , 'SimVerb': self.simverb}
        self.normalizers = {'MEN': 50, 'SimLex': 10}  # , 'SimVerb': 10}
        self.pair_num = {'MEN': 3000, 'SimLex': 999}


[docs]def dataset_vocab(dataset: str) -> list:
    pairs = list(zip(*dataset))[:2]
    return list(set(pairs[0] + pairs[1]))


[docs]def compute_dists(vecs):
    dists = cosine_distances(vecs, vecs)
    return dists


[docs]def neighbors(words, vocab, vecs, n=10):
    dists = compute_dists(vecs)
    if type(words[0]) == str:
        indices = [np.where(vocab == w)[0][0] for w in words]
    else:
        indices = words
    for i in indices:
        print(vocab[i], vocab[np.argsort(dists[i])[:n]])


[docs]def covered(dataset, vocab):
    return list(filter(lambda s: s[0] in vocab and s[1] in vocab, dataset))


[docs]def coverage(vocabulary, data):
    nlp = spacy.load('en')
    vvocab_lemma = [[t for t in nlp(str(w))][0].lemma_ for w in vocabulary]
    vocab = set(list(vocabulary) + vvocab_lemma)

    print('Vocab size:', len(vocabulary))
    print('Vocab size with lemmas:', len(vocab))

    # Semantic similarity/relatedness datasets
    for name, dataset in {'MEN': data.men, 'SimLex': data.simlex}.items():
        coverage = len(covered(dataset, vocabulary))
        coverage_lemma = len(covered(dataset, vocab))
        print(f'{name} pair coverage:',
              coverage_lemma, f'({round(100 * coverage_lemma / len(dataset))}%)')
        print(f'{name} pair coverage without lemmas:',
              coverage, f'({round(100 * coverage / len(dataset))}%)')

    # Mitchell (2008) fMRI data: 60 nouns
    coverage = len(list(set(vocabulary).intersection(set(data.fmri_vocab))))
    coverage_lemma = len(list(set(vocab).intersection(set(data.fmri_vocab))))
    print(f'fMRI coverage:', coverage_lemma, f'({round(100 * coverage_lemma / len(data.fmri_vocab))}%)')
    print(f'fMRI coverage without lemmas:', coverage, f'({round(100 * coverage / len(data.fmri_vocab))}%)')


[docs]def divide_eval_vocab_by_freqranges(distribution_file, eval_data_dir, dataset_name, num_groups=3, save=False):
    with open(distribution_file, 'r') as f:
        dist = json.load(f)
    ds = DataSets(eval_data_dir)
    evalv = list(set(dataset_vocab(ds.datasets[dataset_name])))

    print(f'Filter distribution file with {dataset_name} vocab')
    eval_dist = dict((w, c) for w, c in tqdm(dist.items()) if w in evalv) # Filter distribution vocab by eval vocab

    # Print and save logs
    log = f'#{dataset_name} vocab: {len(evalv)}\n'
    log += f'#Filtered Wiki vocab: {len(eval_dist)}\n'
    if len(eval_dist) < len(evalv):
       log += f'Missing words in Wiki: {list(set(evalv) - set(eval_dist.keys()))}'
    print(log)
    with open(f'{os.path.splitext(distribution_file)[0]}_{dataset_name}_split{num_groups}.log', 'w') as f:
        f.write(log)

    sorted_dist = sorted(eval_dist.items(), key=lambda item: item[1])     # sort words by frequency
    N = len(evalv)
    swords, scounts = zip(*sorted_dist)
    group_size = math.ceil(N / num_groups)
    fqvocabs = []
    for i in range(0, N, group_size):
        fmin = scounts[i]
        if N > i + group_size - 1:
            fmax = scounts[i + group_size - 1]
        else:
            fmax = scounts[-1]
        fqvocabs.append((f'{fmin} {fmax}', swords[i:i+group_size]))
    if save:    # Save vocabs for freq ranges
        with open(f'{os.path.splitext(distribution_file)[0]}_{dataset_name}_split{num_groups}.json', 'w') as f:
            json.dump(fqvocabs, f)
    return fqvocabs


[docs]def compute_correlations(scores: (np.ndarray, list), name_pairs: List[Tuple[str, str]] = None,
                         common_subset: bool = False, leave_out=False):
    """Compute correlation between score series.
        :param scores: Structured array of scores with embedding/ground_truth names.
        :param name_pairs: pairs of scores to correlate. If None, every pair will be computed.
                          if 'gt', everything will be plot against the ground_truth.
       :param leave_out: Leave out 1/leave_out portion of pairs, chosen randomly. Does not leave out if it is False.
    """
    if name_pairs == 'gt':
        name_pairs = [('ground_truth', nm) for nm in scores[0].dtype.names
                      if nm != 'ground_truth']
    elif name_pairs == 'all':
        name_pairs = None
    if not name_pairs:  # Correlations for all combinations of 2
        name_pairs = list(combinations(scores.dtype.names, 2))

    if common_subset:  # Filter rows where any of the scores are missing for a word pair
        ids = set(range(scores.shape[0]))
        for n in scores.dtype.names:
            ids = ids.intersection(set(np.where(scores[n] != MISSING)[0]))
        scs = np.array(np.empty(len(ids)), dtype=scores.dtype)
        for n in scores.dtype.names:
            scs[n] = scores[n][list(ids)]
    else:
        scs = scores

    correlations = {}
    for nm1, nm2 in name_pairs:
        # Filter pairs which the scores, coming from any of the two embeddings, don't cover
        if (scs[nm1] == MISSING).all():
            warnings.warn(f'{nm1} has 0 coverage.')
            correlations[' | '.join([nm1, nm2])] = (0, 0, 0)
        elif (scs[nm2] == MISSING).all():
            warnings.warn(f'{nm2} has 0 coverage.')
            correlations[' | '.join([nm1, nm2])] = (0, 0, 0)
        else:
            scores1, scores2 = zip(*[(s1, s2) for s1, s2 in
                                     zip(scs[nm1], scs[nm2]) if s1 != MISSING and s2 != MISSING])
            assert len(scores1) == len(scores2)
            if leave_out:
                lp = len(scores1)
                keep = 1 - 1 / leave_out
                idx = list(range(lp))
                random.shuffle(idx)
                idx = idx[:int(lp * keep)]
                scores1 = [s for i, s in enumerate(scores1) if i in idx]
                scores2 = [s for i, s in enumerate(scores2) if i in idx]
            corr = spearmanr(scores1, scores2)
            correlations[' | '.join([nm1, nm2])] = (corr.correlation, corr.pvalue, len(scores1))

    return correlations


[docs]def highlight(val, conditions: dict, tablefmt):
    """Highlight value in a table column.
    :param val: number, value
    :param conditions: dict of {colour: condition}
    :param tablefmt: 'simple' is terminal, 'latex-raw' is LaTeX
    """
    val = round(val, ROUND)
    for color, cond in conditions.items():
        if tablefmt == 'simple':
            if cond:
                return pfont([color, 'BOLD'], format(round(val, ROUND), f".{ROUND}f"), PrintFont)
        elif tablefmt in ['latex', 'latex_raw']:  # needs to be amended by hand
            if cond:
                return pfont([color, 'BOLD'], str(format(round(val, ROUND), f".{ROUND}f")), LaTeXFont)
    return format(val, f".{ROUND}f")


[docs]def mm_over_uni(name, score_dict):
    nam = deepcopy(name)
    if NAME_DELIM in nam:  # SemSim scores
        prefix, vname = nam.split(NAME_DELIM)
        prefix = prefix + NAME_DELIM
    else:  # Brain scores
        vname = nam
        prefix = ''
    if MM_TOKEN in vname:
        nm1, nm2 = vname.split(MM_TOKEN)
        get_score = lambda x: round(x, ROUND) if isinstance(x, float) else round(x[0], ROUND)
        return get_score(score_dict[name]) > get_score(score_dict[prefix + nm1]) and \
               get_score(score_dict[name]) > get_score(score_dict[prefix + nm2])
    return False


[docs]def latex_escape(string):
    return ''.join([LATEX_ESCAPE_RULES.get(c, c) for c in string])


[docs]def print_correlations(scores: np.ndarray, name_pairs='gt',
                       common_subset: bool = False, tablefmt: str = "simple", caption='', label=''):
    correlations = compute_correlations(scores, name_pairs, common_subset=common_subset)
    maxcorr = max(list(zip(*correlations.values()))[0])

    def mm_o_uni(name):
        return mm_over_uni(name, correlations)

    if 'latex' in tablefmt:
        escape = latex_escape
        font = LaTeXFont
    else:
        escape = lambda x: x
        font = PrintFont
    table = tabulate([(pfont(['ITALIC'], escape(Embeddings.get_label(nm)), font),
                       highlight(corr, {'red': corr == maxcorr, 'blue': mm_o_uni(nm)}, tablefmt),
                       format(pvalue, f".{ROUND}f"),
                       length)
                      for nm, (corr, pvalue, length) in correlations.items()],
                     headers=[pfont(['BOLD'], x, font) for x in
                              ['Name pairs', 'Spearman', 'P-value', 'Coverage']],
                     tablefmt=tablefmt)
    if 'latex' in tablefmt:
        table = latex_table_post_process(table, [3, 9], caption, label=label)

    print(table)


[docs]def print_subsampled_correlations(scores: np.ndarray, name_pairs='gt',
                       common_subset: bool = False, tablefmt: str = "simple", caption='', label='', n_sample=3):
    correlation_smpl = {}
    for i in range(n_sample):
        correlations = compute_correlations(scores, name_pairs, common_subset=common_subset, leave_out=3)
        if i == 0:
            for k, (corr, pval, len) in correlations.items():
                correlation_smpl[k] = ([corr], [pval], [len])
        else:
            for k, (corrs, pvals, lens) in correlation_smpl.items():
                corr, pval, len = correlations[k]
                correlation_smpl[k] = (corrs + [corr], pvals + [pval], lens + [len])
    maxcorr = max(list(zip(*correlations.values()))[0])

    # def mm_o_uni(name):
    #     return mm_over_uni(name, correlations)

    if 'latex' in tablefmt:
        escape = latex_escape
        font = LaTeXFont
    else:
        escape = lambda x: x
        font = PrintFont
    table = tabulate([(pfont(['ITALIC'], escape(Embeddings.get_label(nm)), font),
                       f'{format(round(np.mean(corrs), ROUND), f".{ROUND}f")} ({format(round(np.std(corrs), ROUND), f".{ROUND}f")})',
                       f'{format(round(np.mean(pvals), ROUND), f".{ROUND}f")} ({format(round(np.std(pvals), ROUND), f".{ROUND}f")})',
                       lens[0])
                      for nm, (corrs, pvals, lens) in correlation_smpl.items()],
                     headers=[pfont(['BOLD'], x, font) for x in
                              ['Name pairs', 'Spearman', 'P-value', 'Coverage']],
                     tablefmt=tablefmt)
    if 'latex' in tablefmt:
        table = latex_table_post_process(table, [3, 9], caption, label=label)

    print(table)


[docs]def print_brain_scores(brain_scores, tablefmt: str = "simple", caption='', suffix='', label=''):
    # Map for printable labels
    labels = dict((Embeddings.get_label(name), name) for name in brain_scores.keys())
    lingvnames = list(set(list(Embeddings.fasttext_vss.keys())).intersection(set(labels.keys())))
    VGvnames = [n for n in labels.keys() if n == 'VG SceneGraph']
    mm_vis_names = [n for n in labels.keys() if 'MM' in n or MM_TOKEN in n and 'VG SceneGraph' not in n]
    mm_VG_names = [n for n in labels.keys() if MM_TOKEN in n and 'VG SceneGraph' in n]
    visvnames = set(labels.keys()).difference(set(lingvnames + VGvnames + mm_VG_names + mm_vis_names))
    brain_scores_ordered = []
    # Group E_L, E_V, E_S, E_L + E_V, E_L + E_S vecs
    for n in lingvnames:
        brain_scores_ordered.append((n, brain_scores[labels[n]]))
    for n in visvnames:
        brain_scores_ordered.append((n, brain_scores[labels[n]]))
    for n in VGvnames:
        brain_scores_ordered.append((n, brain_scores[labels[n]]))
    for n in mm_vis_names:
        brain_scores_ordered.append((n, brain_scores[labels[n]]))
    for n in mm_VG_names:
        brain_scores_ordered.append((n, brain_scores[labels[n]]))

    vals = list(zip(*[v.values() for v in brain_scores.values()]))

    def print_data(data):
        # Print for individual participants and average 'fMRI' or 'MEG' scores
        v_avg = {'fMRI': 2, 'MEG': 3}[data]
        v_scores = {'fMRI': 0, 'MEG': 1}[data]
        max_avg = max(vals[v_avg])
        maxes = [max(x) for x in zip(*vals[v_scores])]
        part_num = len(list(brain_scores.values())[v_scores][data])  # number of participants
        # Scores per participant
        score_dicts = [dict((k, v[data][p]) for k, v in brain_scores_ordered) for p in range(part_num)]
        avg_dict = dict((k, v[f'{data} Avg']) for k, v in brain_scores_ordered)

        # print(f'\n-------- {data} --------\n')
        if 'latex' in tablefmt:
            escape = latex_escape
            font = LaTeXFont
        else:
            escape = lambda x: x
            font = PrintFont

        table = tabulate([[pfont(['ITALIC'], escape(name), font)] +
                          [highlight(c, {'red': c == max_scr, 'blue': mm_over_uni(name, scr_dict)}, tablefmt)
                           for c, max_scr, scr_dict in zip(v[data], maxes, score_dicts)] +
                          [highlight(v[f'{data} Avg'],
                                     {'red': v[f'{data} Avg'] == max_avg, 'blue': mm_over_uni(name, avg_dict)},
                                     tablefmt)] +
                          [round(np.std(v[data]), ROUND), v['length']]
                          for name, v in brain_scores_ordered],
                         headers=[pfont(['BOLD'], x, font) for x in
                                  ['Embedding'] +
                                  [f'P{i + 1}' for i in range(part_num)] +
                                  ['Avg', 'STD', 'Coverage']],
                         tablefmt=tablefmt)

        # Table of scores averaged for participants over all models per modality
        ling_avg_P = [np.mean([v for k, v in dictP.items() if k in lingvnames]) for dictP in score_dicts]
        vis_avg_P = [np.mean([v for k, v in dictP.items() if k in visvnames]) for dictP in score_dicts]
        VG_avg_P = [np.mean([v for k, v in dictP.items() if k in VGvnames]) for dictP in score_dicts]
        mm_vis_avg_P = [np.mean([v for k, v in dictP.items() if k in mm_vis_names]) for dictP in score_dicts]
        mm_VG_avg_P = [np.mean([v for k, v in dictP.items() if k in mm_VG_names]) for dictP in score_dicts]
        maxes = [max(p) for p in zip(ling_avg_P, vis_avg_P, VG_avg_P, mm_vis_avg_P, mm_VG_avg_P)]

        table_P = tabulate([[pfont(['ITALIC'], mod, font)] +
                            [highlight(x, {'BOLD': x == mx}, tablefmt) for x, mx in zip(avgPs, maxes)]
                            for mod, avgPs in [('$E_L$', ling_avg_P),
                                               ('$E_V$', vis_avg_P),
                                               ('$E_S$', VG_avg_P),
                                               ('$E_L + E_V$', mm_vis_avg_P),
                                               ('$E_L + E_S$', mm_VG_avg_P)]],
                           headers=[pfont(['BOLD'], x, font) for x in
                                    ['Modality'] + [f'P{i + 1}' for i in range(part_num)]],
                           tablefmt=tablefmt)
        if 'latex' in tablefmt:
            table = latex_table_post_process(table, [3, 9],
                                             f'{data} scores for each participant and embedding' + suffix + caption,
                                             fit_to_page=True, label=data + '_' + label)
            table_P = latex_table_post_process(table_P, [],
                                               f'{data} scores averaged over each modality' + suffix + \
                                               ' Bold signifies the highest average performance for each participant.',
                                               fit_to_page=True, label='_'.join([data, label, 'participants']))  # latex_table_wrapper(table_P)
        print(table, '\n')
        print(table_P)

    print_data('fMRI')
    print_data('MEG')


[docs]class PlotColour:

[docs]    @staticmethod
    def colour_by_modality(labels):
        colours = []
        linestyles = []
        alphas = []
        for l in labels:
            lst = '-'
            al = 0.8
            if MM_TOKEN in l or 'MM' in l:
                if 'VG SceneGraph' in l:
                    colours.append('#ff3e96')
                    al = 0.5
                else:
                    colours.append('#e6b830')
                    lst = ':'
                    al = 0.5
            elif l in ['wikinews', 'wikinews_sub', 'crawl', 'crawl_sub', 'w2v13']:
                colours.append('green')
            elif 'VG-' in l:
                colours.append('purple')
            elif 'VG SceneGraph' == l:
                colours.append('cyan')
            else:
                colours.append('red')
            linestyles.append(lst)
            alphas.append(al)

        return colours, linestyles, alphas

[docs]    @staticmethod
    def get_legend():
        linewidth = 3
        legends = [Line2D([0], [0], color='blue', lw=linewidth),
                   Line2D([0], [0], color='#ff3e96', lw=linewidth),
                   Line2D([0], [0], color='#e6b830', lw=linewidth),
                   Line2D([0], [0], color='green', lw=linewidth),
                   Line2D([0], [0], color='purple', lw=linewidth),
                   Line2D([0], [0], color='cyan', lw=linewidth),
                   Line2D([0], [0], color='red', lw=linewidth)]
        leglabels = ['WordNet concreteness',
                     '$E_L + E_S$',
                     '$E_L + E_V$',
                     '$E_L$',
                     '$E_{VG}$',
                     '$E_S$',
                     '$E_{Google}$']
        return legends, leglabels


[docs]def plot_brain_words(brain_scores, plot_order):
    """Plot hit counts for word in Brain data.
    :param brain_scores: brain score dict
    :param plot_order: 'concreteness' orders words for Wordnet conreteness
                       <emb_name> orders plot for an embedding's scores
    """
    vals = list(zip(*[v.values() for v in brain_scores.values()]))
    labels = Embeddings.get_labels(brain_scores.keys())

    def plot_data(ax, data, ord_vocab, ord_name):
        dscores = {'fMRI': 5, 'MEG': 6}[data]
        word_vals = vals[dscores]
        scores = {}
        wordlists = {}
        for label, val in zip(labels, word_vals):  # embeddings
            word_dict = {}  # Init dictionary with Brain vocab so we have vectors with same length to plot
            for w in ord_vocab:
                word_dict[w] = 0
            for p in val:  # participants
                for word_pair in p:  # word pairs
                    word_dict[word_pair['word1']] += word_pair['hit']
                    word_dict[word_pair['word2']] += word_pair['hit']
            # word_dict = dict(((w, word_dict[w]) for w in ord_vocab))  # Sort by concreteness
            scores[label] = list(word_dict.values())
            wordlists[label] = list(word_dict.keys())

        # Convert to structured array
        score_arrays = dict2struct_array(scores)

        tsuffix = ord_name + ' synset score'
        # Sort by ord_name Embedding
        if ord_name != 'Median' and ord_name != 'Most concrete':
            if ord_name not in labels:
                ord_name = Embeddings.get_label(ord_name)
            tsuffix = 'ordered by ' + ord_name
            score_arrays = np.sort(score_arrays, order=ord_name)
            ord_vocab = [w for w, s in sorted(zip(wordlists[ord_name], scores[ord_name]), key=lambda x: x[1])]

        colours, linestyles, alphas = PlotColour.colour_by_modality(labels)
        # allhits = sum([hits for hits in scores.values()], [])
        # ax.set_yticklabels([i for i in range(min(allhits), max(allhits), 6)], rotation=90)
        plot_scores(score_arrays,
                    vecs_names=labels,
                    labels=None,
                    colours=colours,
                    linestyles=linestyles,
                    title=f'{data} - {tsuffix}',
                    alphas=alphas,
                    xtick_labels=ord_vocab,
                    ax=ax,
                    show=False,
                    type='scatter',
                    swapaxes=True)
        ax.set_xlabel('Hit number')
        ax.yaxis.set_ticks(range(len(ord_vocab)))
        ax.set_yticklabels(ord_vocab, fontsize=14)

    if plot_order == 'concreteness':
        # Order by word concreteness
        word_concs = [[w] + list(wn_concreteness(w)) for w in DataSets.fmri_vocab]
        ord_med_vocab = [w for w, cme, cma in sorted(word_concs, key=lambda x: x[1])]
        ord_max_vocab = [w for w, cme, cma in sorted(word_concs, key=lambda x: x[2])]

        axs = [i for i in range(4)]
        fig, ((axs[0], axs[1]), (axs[2], axs[3])) = plt.subplots(2, 2, figsize=(20, 15))
        plot_data(axs[0], 'fMRI', ord_med_vocab, 'Median')
        plot_data(axs[1], 'MEG', ord_med_vocab, 'Median')
        plot_data(axs[2], 'fMRI', ord_max_vocab, 'Most concrete')
        plot_data(axs[3], 'MEG', ord_max_vocab, 'Most concrete')
    else:
        axs = [i for i in range(2)]
        fig, ((axs[0], axs[1])) = plt.subplots(1, 2, figsize=(20, 13))
        plot_data(axs[0], 'fMRI', DataSets.fmri_vocab, plot_order)
        plot_data(axs[1], 'MEG', DataSets.fmri_vocab, plot_order)

    legs, leglabels = PlotColour.get_legend()   # Leave out WordNet concreteness [1:]
    fig.legend(legs[1:], leglabels[1:], loc=9, edgecolor='inherit', ncol=7, borderaxespad=-0.2, numpoints=1, fontsize=16)
    fig.tight_layout(pad=1.0)

    return fig


[docs]def eval_dataset(dataset: List[Tuple[str, str, float]],
                 dataset_name: str,
                 embeddings: List[np.ndarray],
                 vocabs: List[List[str]],
                 labels: List[str]) -> (np.ndarray, list):
    scores = np.array(np.empty(len(dataset)),
                      dtype=[('ground_truth', np.ndarray)] +
                            [(label, np.ndarray) for label in labels])
    pairs = []
    print(f'Evaluate on {dataset_name}')
    for i, (w1, w2, score) in enumerate(tqdm(dataset)):
        scores['ground_truth'][i] = float(score)
        for emb, vocab, label in zip(embeddings, vocabs, labels):
            try:
                scores[label][i] = cosine_similarity(get_vec(w1, emb, vocab), get_vec(w2, emb, vocab))[0][0]
            except IndexError:
                scores[label][i] = MISSING
            if (scores[label] == MISSING).all():
                print(f'Warning: No word pairs were found in {label} for {dataset_name}!')
        pairs.append((w1, w2))

    return scores, pairs


[docs]def plot_scores(scores: np.ndarray, gt_divisor=10, vecs_names=None, labels=None, colours=None, linestyles=None,
                title=None, type='plot', alphas=None, xtick_labels=None, ax=None, show=True, swapaxes=False):
    """Scatter plot of a structured array."""
    scs = deepcopy(scores)
    if 'ground_truth' in scores.dtype.names:
        scs['ground_truth'] /= gt_divisor

    if vecs_names is None:
        vecs_names = scs.dtype.names
    if labels is None:
        labs = [None for i in range(len(vecs_names))]
    else:
        labs = labels
    if colours is None:
        colours = [None for i in range(len(vecs_names))]
    if linestyles is None:
        linestyles = [None for i in range(len(vecs_names))]

    for nm, c, l, ls, al in zip(vecs_names, colours, labs, linestyles, alphas):
        mask = scs[nm] > MISSING  # Leave out the pairs which aren't covered
        x = np.arange(scs[nm].shape[0])[mask]
        y = scs[nm][mask]
        if swapaxes:
            buf = deepcopy(x)
            x = y
            y = buf
        if type == 'scatter':
            ax.scatter(x, y, label=l, alpha=al, color=c)
        elif type == 'plot':
            ax.plot(x, y, label=l, alpha=al, color=c, linestyle=ls, lw=linewidth)
    if labels is not None:
        ax.legend(fontsize='small', loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0.2)
    if title:
        ax.set_title(title)
    if show:
        plt.show()

    return ax


[docs]def plot_for_quantities(scores: np.ndarray, gt_divisor, common_subset=False, legend=False, pair_num=None):
    ling_names = [n for n in scores.dtype.names if 'fqrng' not in n and 'ground_truth' not in n and 'model' in n
                  and '+' not in n]
    vis_names = [n for n in scores.dtype.names if 'fqrng' not in n and 'ground_truth' not in n and 'model' not in n
                 and '+' not in n]
    mm_names = []
    for vn in vis_names:
        mm_names.append([n for n in scores.dtype.names if 'fqrng' not in n and 'ground_truth' not in n and '+' in n
                        and vn in n])
    names = ling_names + vis_names + list(chain.from_iterable(mm_names))
    quantities = sorted(list(set([int(n.split('_')[1][1:]) for n in ling_names])))
    quantities = quantities[1:] + [quantities[0]]    # -1: max train file num
    scs = scores[names + ['ground_truth']]
    scs['ground_truth'] /= gt_divisor
    correlations = compute_correlations(scs, name_pairs='gt', common_subset=common_subset)

    # Plot data with error bars
    def bar_data(nms):
        means, errs, covs = [], [], []
        for q in quantities:
            qnames = [n for n in nms if f'n{q}_' in n]
            qcorrs, qpvals, qcoverages = zip(*[correlations['ground_truth | ' + n] for n in qnames])
            q_mean, q_std, q_cov = np.mean(qcorrs), np.std(qcorrs), np.mean(qcoverages)
            means.append(q_mean)
            errs.append(q_std)
            covs.append(100 * q_cov / pair_num)   # Return coverage in percentages
        return means, errs, covs

    ling_means, ling_errs, ling_covs = bar_data(ling_names)

    def coverage_texts(xpos, means, covs, errs):
        for xp, y, cov, err in zip(xpos, means, covs, errs):
            ax.text(xp, y + err + 0.02, str(int(cov)), fontsize=20, horizontalalignment='center', rotation=90)

    fig, ax = plt.subplots()
    bar_width = 0.35
    fontsize = 20
    xpos = np.linspace(1, 2 + 2 * len(vis_names), len(quantities))

    ax.bar(np.array(xpos), ling_means, yerr=ling_errs, width=bar_width, label='$E_L$')
    if not common_subset:
        coverage_texts(xpos, ling_means, ling_covs, ling_errs)
    pi = 1
    C = 1
    # separate MM for vis_names too
    for mmn, vn in zip(mm_names, vis_names):
        mmlabel = '$E_L$ + ' + Embeddings.get_emb_type_label(vn)
        mmn_means, mmn_errs, mmn_covs = bar_data(mmn)
        mmn_xpos = np.array(xpos) + pi * bar_width
        ax.bar(mmn_xpos, mmn_means, yerr=mmn_errs, width=bar_width, label=mmlabel, color=f'C{C}')
        if not common_subset:
            coverage_texts(mmn_xpos, mmn_means, mmn_covs, mmn_errs)
        C += 1
        pi += 1
    for vn in vis_names:
        vcorr, vpval, vcoverage = correlations['ground_truth | ' + vn]
        ax.plot([xpos[0], pi * bar_width * mmn_xpos[-1]], [vcorr, vcorr],
                label=Embeddings.get_emb_type_label(vn), color=f'C{C}')
        vcoverage = 100 * vcoverage / pair_num
        if not common_subset:
            ax.text(pi * bar_width * mmn_xpos[-1] + 0.02, vcorr, str(int(vcoverage)),
                    fontsize=20)
        C += 1
        # vxpos = np.array(xpos) + pi * bar_width
        # vcorrs = [vcorr for i in xpos]
        # verrs = [0 for i in xpos]
        # ax.bar(vxpos, vcorrs, yerr=verrs, width=bar_width, label=Embeddings.get_emb_type_label(vn))
        # coverage_texts(vxpos, vcorrs, [vcoverage for i in xpos], verrs)
        # pi += 1

    ax.set_xticks(xpos)
    ax.set_xticklabels(['8M', '1G', '2G', '5G', '13G'], fontsize=fontsize)
    ax.set_ylabel('Spearman correlation', fontsize=fontsize)
    plt.tick_params(axis='both', labelsize=fontsize)
    if legend:
        ax.legend(loc=(0, 1.08), fontsize=15, ncol=5, columnspacing=1)

    if common_subset:
        return int(vcoverage)
    else:
        return False


[docs]def plot_for_freqranges(scores: np.ndarray, gt_divisor, quantity=-1, common_subset=False, pair_num=None,
                        split_num=None, ds_name=None):
    names = [n for n in scores.dtype.names if 'fqrng' in n and f'n{quantity}' in n and 'ground_truth' not in n
             and 'common_subset' not in n]
    if split_num:
        names = [n for n in names if f'split{split_num}' in n and n.count('+') <= 1]
    if ds_name:
        names = [n for n in names if ds_name in n]
    mixed = [n for n in scores.dtype.names if 'n-1' in n and 'fqrng' not in n and 'ground_truth' not in n]
    ling_names = [n for n in names if 'model' in n and '+' not in n]
    vis_names = [n for n in scores.dtype.names if 'fqrng' not in n and 'ground_truth' not in n and 'model' not in n
                 and '+' not in n]
    mm_names = []
    for vn in vis_names:
        mm_names.append([n for n in names if 'model' in n and '+' in n and vn in n])

    names = ling_names + vis_names + list(chain.from_iterable(mm_names))
    scs = scores[names + ['ground_truth'] + mixed]
    scs['ground_truth'] /= gt_divisor
    correlations = compute_correlations(scs, name_pairs='gt', common_subset=common_subset)

    freq_ranges = sorted(list(set([tuple(map(int, n.split('_')[-1].split('-'))) for n in ling_names])))

    # Plot data with error bars
    def bar_data(nms, mixed_pattern):
        means, errs, covs = [], [], []
        for fmin, fmax in freq_ranges:
            fnames = [n for n in nms if f'fqrng_{fmin}-{fmax}' in n]
            fcorrs, fpvals, fcoverages = zip(*[correlations['ground_truth | ' + n] for n in fnames])
            f_mean, f_std, f_cov = np.mean(fcorrs), np.std(fcorrs), np.mean(fcoverages)
            means.append(f_mean)
            errs.append(f_std)
            covs.append(100 * f_cov / pair_num)
        # MIXED: Full data
        mcorrs, mpvals, mcoverages = zip(*[correlations['ground_truth | ' + n] for n in mixed
                                                                        if mixed_pattern(n)])
        m_mean, m_std, m_cov = np.mean(mcorrs), np.std(mcorrs), np.mean(mcoverages)
        means.append(m_mean)
        errs.append(m_std)
        covs.append(100 * m_cov / pair_num)
        return means, errs, covs

    def coverage_texts(xpos, means, covs, errs):
        for xp, y, cov, err in zip(xpos, means, covs, errs):
            ax.text(xp, y + err + 0.02, str(int(cov)), fontsize=20, horizontalalignment='center', rotation=90)

    ling_means, ling_errs, ling_covs = bar_data(ling_names, lambda x: '+' not in x)

    fig, ax = plt.subplots()
    bar_width = 0.35
    fontsize = 20
    xpos = np.linspace(1, 2 + 2 * len(vis_names), len(freq_ranges) + 1)

    ax.bar(np.array(xpos), ling_means, yerr=ling_errs, width=bar_width, label='$E_L$')
    coverage_texts(xpos, ling_means, ling_covs, ling_errs)
    pi = 1
    C = 1
    # separate MM for vis_names too
    for mmn, vn in zip(mm_names, vis_names):
        mmlabel = '$E_L$ + ' + Embeddings.get_emb_type_label(vn)
        mmn_means, mmn_errs, mmn_covs = bar_data(mmn, lambda x: vn in x)
        mmn_xpos = np.array(xpos) + pi * bar_width
        ax.bar(mmn_xpos, mmn_means, yerr=mmn_errs, width=bar_width, label=mmlabel, color=f'C{C}')
        coverage_texts(mmn_xpos, mmn_means, mmn_covs, mmn_errs)
        pi += 1
        C += 1
    for vn in vis_names:
        vcorr, vpval, vcoverage = correlations['ground_truth | ' + vn]
        ax.plot([xpos[0], pi * bar_width * mmn_xpos[-1]], [vcorr, vcorr],
                label=Embeddings.get_emb_type_label(vn), color=f'C{C}')
        vcoverage = 100 * vcoverage / pair_num
        ax.text(pi * bar_width * mmn_xpos[-1] + 0.02, vcorr, str(int(vcoverage)),
                fontsize=20)
        C += 1
        # vxpos = np.array(xpos) + pi * bar_width
        # vcorrs = [vcorr for i in xpos]
        # verrs = [0 for i in xpos]
        # vcoverage = 100 * vcoverage / pair_num
        # ax.bar(vxpos, vcorrs, yerr=[0 for i in xpos], width=bar_width, label=Embeddings.get_label(vn))
        # coverage_texts(vxpos, vcorrs, [vcoverage for i in xpos], verrs)
        # pi += 1

    ax.set_xticks(xpos)
    ax.set_xticklabels(['LOW', 'MEDIUM', 'HIGH', 'MIXED'], fontsize=fontsize)
    ax.set_ylabel('Spearman correlation', fontsize=fontsize)
    plt.tick_params(axis='both', labelsize=fontsize)
    ax.legend(loc=(0, 1.08), fontsize=15, ncol=5, columnspacing=1)


[docs]def wn_concreteness(word, similarity_fn=wn.path_similarity):
    """WordNet distance of a word from its root hypernym."""
    syns = wn.synsets(word)
    dists = [1 - similarity_fn(s, s.root_hypernyms()[0]) for s in syns]
    return np.median(dists), max(dists)


[docs]def wn_concreteness_for_pairs(word_pairs, synset_agg: str, similarity_fn=wn.path_similarity,
                              pair_score_agg='sum') -> (np.ndarray, np.ndarray):
    """Sort scores by first and second word's concreteness scores.
    :param pair_score_agg: 'sum' adds scores for the two words, 'diff' computes their absolute difference.
    :return (ids, scores): sorted score indices and concreteness scores.
    """
    synset_agg = {'median': 0, 'most_conc': 1}[synset_agg]
    concrete_scores = [(i, wn_concreteness(w1, similarity_fn)[synset_agg],
                        wn_concreteness(w2, similarity_fn)[synset_agg])
                       for i, (w1, w2) in enumerate(word_pairs)]
    # Sort for word pairs
    if pair_score_agg == 'sum':
        agg = lambda x: x[1] + x[2]
    elif pair_score_agg == 'diff':
        agg = lambda x: np.abs(x[1] - x[2])
    ids12 = [(i, agg([i, s1, s2])) for i, s1, s2 in sorted(concrete_scores, key=agg)]
    ids, scores = list(zip(*ids12))
    return np.array(ids), np.array(scores)


[docs]def plot_by_concreteness(scores: np.ndarray, word_pairs, ax1, ax2, common_subset=False, vecs_names=None,
                         concrete_num=100, title_prefix='', pair_score_agg='sum', show=False):
    """Plot scores for data splits with increasing concreteness."""
    for synset_agg, ax in zip(['median', 'most_conc'], [ax1, ax2]):
        corrs_by_conc = defaultdict(list)
        ids12, concs = wn_concreteness_for_pairs(word_pairs, synset_agg, pair_score_agg=pair_score_agg)
        scs = scores[ids12]
        for i in range(0, len(ids12), concrete_num):
            corrs = compute_correlations(scs[i:i + concrete_num], 'gt', common_subset=common_subset)
            for k, v in corrs.items():
                corrs_by_conc[k].append(v[0])  # Append correlations score for each embedding

        corrs_by_conc_a = dict2struct_array(corrs_by_conc)

        vnames = [n for n in corrs_by_conc_a.dtype.names if 'fmri' not in n and 'frcnn' not in n]
        labels = [Embeddings.get_label(n.split(NAME_DELIM)[1]) for n in vnames]

        colours, linestyles, alphas = PlotColour.colour_by_modality(labels)
        labelpad = 10

        # Concreteness scores on different axis but the same plot
        axn = ax
        axn.plot(concs, color='blue')
        axn.set_xlabel('Word pairs', labelpad=labelpad)
        axn.set_ylabel('WordNet concreteness', labelpad=labelpad)
        axn.yaxis.label.set_color('blue')
        # Xticklabels by step size
        n = scores.shape[0]
        step = 500
        xtlabels = [i for i in range(concrete_num, n) if i % step == 0] + [n]
        axn.xaxis.set_ticks([i - 1 for i in xtlabels])
        axn.set_xticklabels(xtlabels)

        # Plot for Spearman's correlations
        axp = axn.twiny().twinx()
        axp = plot_scores(corrs_by_conc_a,
                          vecs_names=vnames,
                          labels=None,
                          colours=colours,
                          linestyles=linestyles,
                          title='',
                          alphas=alphas,
                          xtick_labels=None,
                          ax=axp,
                          show=show)
        axp.set_ylabel("Spearman's correlation", labelpad=labelpad - 3)
        # TODO: Doesn't show, order of axn.twiny().twinx() matters...
        axp.set_xlabel('WordNet concreteness splits by 100 pairs', labelpad=labelpad)
        n = corrs_by_conc_a.shape[0]
        axp.xaxis.set_ticks([i for i in range(-1, n)])
        axp.set_xticklabels(['' for i in axp.get_xticklabels()])
        syna = {'median': 'Median', 'most_conc': 'Most Concrete'}[synset_agg]
        axp.set_title(f'{title_prefix} - Synset Agg {syna}')


[docs]def eval_concreteness(scores: np.ndarray, word_pairs, num=100, gt_divisor=10, vecs_names=None, tablefmt='simple'):
    """Eval dataset instances based on WordNet synsets."""

    # Sort scores by first and second word's concreteness scores
    def print_conc(synset_agg, title):
        ids12 = wn_concreteness_for_pairs(word_pairs, synset_agg)
        # plot_scores(scores[ids1], gt_divisor, vecs_names, title=title)
        # plot_scores(scores[ids2], gt_divisor, vecs_names, title=title)
        # plot_scores(scores[ids12][:100], gt_divisor, vecs_names, title=title + ' - 100 least concrete')
        # plot_scores(scores[ids12][-100:], gt_divisor, vecs_names, title=title + ' - 100 most concrete')
        print(f'\n-------- {num} least concrete - {title} -------\n')
        print_correlations(scores[ids12][:num], name_pairs='gt', common_subset=False, tablefmt=tablefmt)
        print(f'\n-------- {num} most concrete - {title} -------\n')
        print_correlations(scores[ids12][-num:], name_pairs='gt', common_subset=False, tablefmt=tablefmt)

    # plots both for median concreteness of synsets and for the most concrete synset of words
    print_conc('median', 'Median synset concreteness')
    print_conc('most_conc', 'Most concrete synsets')


[docs]def compute_scores(actions, embeddings, scores, datasets, pairs, brain_scores=None, pre_score_files: str = None,
                   ling_vecs_names=[], vecs_names=[], mm_lingvis=False, mm_embs_of: List[Tuple[str]] = None,
                   mm_padding=False, common_subset=False):
    """Compute scores on all evaluation datasets."""
    print(actions)
    embs = embeddings.embeddings
    vocabs = embeddings.vocabs
    names = embeddings.vecs_names

    # Create multi-modal embeddings if ligvis or specific embedding pairs are given
    if mm_lingvis or mm_embs_of:
        if mm_lingvis:  # TODO: test
            mm_labels = list(product(ling_vecs_names, vecs_names))
            emb_tuples = [(embs[names.index(ln)], embs[names.index(vn)]) for ln, vn in mm_labels]
            vocab_tuples = [(vocabs[names.index(ln)], vocabs[names.index(vn)]) for ln, vn in mm_labels]
        elif mm_embs_of:  # Create MM Embeddings based on the given embedding labels
            emb_tuples = [tuple(embs[names.index(l)] for l in t) for t in mm_embs_of]
            vocab_tuples = [tuple(vocabs[names.index(l)] for l in t) for t in mm_embs_of]
            mm_labels = [tuple(l for l in t) for t in mm_embs_of]

        mm_embeddings, mm_vocabs, mm_labels = mid_fusion(emb_tuples, vocab_tuples, mm_labels, mm_padding)
        embs += mm_embeddings
        vocabs += mm_vocabs
        names += mm_labels

    if 'compscores' in actions:  # SemSim scores
        for name, dataset in datasets.datasets.items():
            dscores, dpairs = eval_dataset(dataset, name, embs, vocabs, names)
            scores[name] = dscores
            pairs[name] = dpairs

        if pre_score_files:  # Load previously saved score files and add the new scores.
            print(f'Load {pre_score_files} and join with new scores...')
            for name, dataset in datasets.datasets.items():
                pre_scores = np.load(f'{pre_score_files}_{name}.npy', allow_pickle=True)
                scores[name] = utils.join_struct_arrays([pre_scores, scores[name]])

    if 'compbrain' in actions:  # Brain scores
        if common_subset:  # Intersection of all vocabs for two_vs_two and it filters out the common subset
            vocabs = [list(set.intersection(*map(set, vocabs))) for v in vocabs]
        for emb, vocab, name in zip(embs, vocabs, names):
            fMRI_scores, MEG_scores, length, fMRI_scores_avg, MEG_scores_avg, \
            fMRI_word_scores, MEG_word_scores = two_vs_two.run_test(embedding=emb, vocab=vocab)
            brain_scores[name] = {'fMRI': fMRI_scores, 'MEG': MEG_scores,
                                  'fMRI Avg': fMRI_scores_avg, 'MEG Avg': MEG_scores_avg,
                                  'length': length,
                                  'fMRI words': fMRI_word_scores, 'MEG words': MEG_word_scores}

        if pre_score_files:  # Load previously saved score files and add the new scores.
            with open(f'{pre_score_files}_brain.json', 'r') as f:
                pre_brain_scores = json.load(f)
                for pname, pbscores in pre_brain_scores.items():
                    brain_scores[name] = pbscores

    return scores, brain_scores, pairs


# TODO: Nicer parameter handling, with exception messages
[docs]@arg('-a', '--actions', nargs='+',
     choices=['printcorr', 'plotscores', 'concreteness', 'coverage', 'compscores', 'compbrain',
              'brainwords', 'printbraincorr', 'plot_quantity', 'plot_freqrange'], default='printcorr')
@arg('-lvns', '--ling_vecs_names', nargs='+', type=str,
     choices=['w2v13', 'wikinews', 'wikinews_sub', 'crawl', 'crawl_sub'], default=[])
@arg('-vns', '--vecs_names', nargs='+', type=str)
@arg('-plto', '--plot_orders', nargs='+', type=str)
@arg('-pltv', '--plot_vecs', nargs='+', type=str)
@arg('-mmembs', '--mm_embs_of', type=tuple_list)
@arg('-pcorr', '--print_corr_for', choices=['gt', 'all'], default='all')
def main(datadir, embdir: str = None, vecs_names=[], savepath=None, loadpath=None,
         actions=['plotcorr'], plot_orders=['ground_truth'], plot_vecs=[],
         ling_vecs_names=[], pre_score_files: str = None, mm_embs_of: List[Tuple[str]] = None,
         mm_lingvis=False, mm_padding=False, print_corr_for=None, common_subset=False,
         tablefmt: str = "simple", concrete_num=100, pair_score_agg='sum', quantity=-1):
    """
    :param actions: Choose from the following:
        'printcorr': Print correlation in tables on MEN and SimLex.
        'plotscores': Plot correlations on MEN and SimLex.
        'concreteness': Scores on caption_comsub Semantic Similarity dataset splits, ordered by pair_score_agg of
                        WordNet concreteness scores of the two words in every word pair.
                        Optional: mm_padding
        'coverage': Save coverages on similarity/relatedness/brain data.
        'compscores': Compute scores on similarity/relatedness evaluation datasets.
        'compbrain': Compute scores on brain evaluation datasets.
        'brainwords': Plot Qualitative analysis on words in the brain data.
        'printbraincorr': Print correlations on brain data.
        'plot_quantity': Plot similarity/relatedness result for text quantity ranges.
        'plot_freqrange': Plot similarity/relatedness result for wor frequency ranges.
    :param pair_score_agg: 'sum' or 'diff' of concreteness scores of word pairs.
    :param mm_lingvis: if True, create multi-modal embeddings, otherwise specific embedding pairs should be given.
    :param tablefmt: printed table format. 'simple' - terminal, 'latex_raw' - latex table.
    :param concrete_num: Plot of WordNet concreteness splits by concrete_num number of pairs.
    :param datadir: Path to directory which contains evaluation data (and embedding data if embdir is not given)
    :param vecs_names: List[str] Names of embeddings
    :param embdir: Path to directory which contains embedding files.
    :param savepath: Full path to the file to save scores without extension. None if there's no saving.
    :param loadpath: Full path to the files to load scores and brain results from without extension.
                     If None, they'll be computed.
    :param plot_orders: Performance plot ordered by similarity scores of these datasets or embeddings.
    :param plot_vecs: List[str] Names of embeddings to plot scores for.
    :param ling_vecs_names: List[str] Names of linguistic embeddings.
    :param pre_score_files: Previously saved score file path without extension, which the new scores will be merged with
    :param mm_embs_of: List of str tuples, where the tuples contain names of embeddings which are to
                       be concatenated into a multi-modal mid-fusion embedding.
    :param mm_padding: Default False. Multi-modal mid-fusion method. If true, all the vectors are kept from the embeddings' vocabularies.
                       Vector representations without a vector from another modality are padded with zeros.
    :param print_corr_for: 'gt' prints correlations scores for ground truth, 'all' prints scores between all
                            pairs of scores.
    :param common_subset: action printcorr: Print results for subests of the eval datasets which are covered by all
                          embeddings' vocabularies.
                          action compbarin: Compute brain scores for interection of vocabularies.

    """

    scores = {}
    pairs = {}
    brain_scores = {}

    datasets = DataSets(datadir)

    if loadpath:
        for name, dataset in datasets.datasets.items():
            score_file = f'{loadpath}_{name}.npy'
            if os.path.exists(score_file):
                scores[name] = np.load(f'{loadpath}_{name}.npy', allow_pickle=True)
        if os.path.exists(f'{loadpath}_brain.json'):
            with open(f'{loadpath}_brain.json', 'r') as f:
                brain_scores = json.load(f)

        # For printing tables
        if 'nopadding' in loadpath:
            title_pad = 'Intersection'
            fname_pad = 'nopadding'
        else:
            title_pad = 'Padding'
            fname_pad = 'padding'

        score_explanation0 = f'Multi-modal embeddings are created using the {title_pad} technique. ' + \
                            'The table sections contain linguistic, visual and multi-modal embeddings in this order. '

        score_explanation = score_explanation0 + \
                            'Red colour signifies the best performance, blue means that the multi-modal ' + \
                            'embedding outperformed the corresponding uni-modal ones.'
    else:
        if not embdir:
            embdir = datadir
        embeddings = Embeddings(embdir, vecs_names, ling_vecs_names)

    if 'compscores' in actions or 'compbrain' in actions:
        scores, brain_scores, pairs = compute_scores(
            actions, embeddings, scores, datasets, pairs, brain_scores=brain_scores,
            pre_score_files=pre_score_files, ling_vecs_names=ling_vecs_names, vecs_names=vecs_names,
            mm_lingvis=mm_lingvis, mm_embs_of=mm_embs_of, mm_padding=mm_padding, common_subset=common_subset)

    if 'plot_quantity' in actions:
        for name in list(scores.keys()):
            scrs = deepcopy(scores[name])
            coverage = plot_for_quantities(scrs, gt_divisor=datasets.normalizers[name], common_subset=common_subset,
                                legend=True, pair_num=datasets.pair_num[name])
            cstag = f'common_subset_{coverage}_' if common_subset else ''
            plt.savefig(f'../figs/quantities_lines_{cstag}' + name + '.png', bbox_inches='tight')

    if 'plot_freqrange' in actions:
        for name in list(scores.keys()):
            scrs = deepcopy(scores[name])
            plot_for_freqranges(scrs, gt_divisor=datasets.normalizers[name], common_subset=common_subset,
                                quantity=quantity, pair_num=datasets.pair_num[name], split_num=3, ds_name=name)
            cstag = 'common_subset_' if common_subset else ''
            plt.savefig(f'../figs/eqsplit_freqranges_lines_{cstag}' + name + '.png', bbox_inches='tight')

    if 'plotscores' in actions:
        for name in list(scores.keys()):
            scrs = deepcopy(scores[name])
            for plot_order in plot_orders:  # order similarity scores of these datasets or embeddings
                plot_scores(np.sort(scrs, order=plot_order), gt_divisor=datasets.normalizers[name],
                            vecs_names=plot_vecs + ['ground_truth'])

    if 'concreteness' in actions:
        with open(loadpath + '_pairs.json', 'r') as f:
            word_pairs = json.load(f)

        if common_subset:
            commonsub = 'commonsubset'
            caption_comsub = "the embeddings' common subset of"
        else:
            commonsub = 'full'
            caption_comsub = "the full"

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        for i, (name, scrs) in enumerate(scores.items()):
            # print(f'\n-------- {name} scores -------\n')
            # eval_concreteness(scrs, word_pairs[name], num=concrete_num,
            #                   gt_divisor=datasets.normalizers[name], vecs_names=plot_vecs + ['ground_truth'],
            #                   tablefmt=tablefmt)
            plot_by_concreteness(scrs, word_pairs[name], axes[i][0], axes[i][1], common_subset=common_subset,
                                 vecs_names=plot_vecs + ['ground_truth'],
                                 concrete_num=concrete_num,
                                 title_prefix=name,
                                 pair_score_agg=pair_score_agg,
                                 show=False)

        legs, leglabels = PlotColour.get_legend()
        fig.legend(legs, leglabels, loc=9, edgecolor='inherit', ncol=7, borderaxespad=-0.2, numpoints=1)
        fig.tight_layout(pad=1.0)

        agg = {'sum': 'sum', 'diff': 'difference'}[pair_score_agg]
        fname = f'{commonsub}_{fname_pad}_{pair_score_agg}'
        fpath = 'figs/' + fname + '.png'
        latex_fig = '\\begin{figure}\n\centering\n' + \
                    '\includegraphics[width=\\textwidth]{figs/' + fname + '.png}' + \
                    '\n\caption{' + f'Scores on {caption_comsub} Semantic Similarity dataset splits, ordered by ' + \
                    f'the {agg} of WordNet concreteness scores of ' + \
                    f'the two words in every word pair. Mid-fusion method: {title_pad}.' + '}\n' + \
                    '\label{f:' + fname + '}' + \
                    '\n\end{figure}\n\n'
        # Save figure and figures tex file
        plt.savefig(fpath, bbox_inches='tight')
        with open('figs/figs_concreteness.tex', 'a+') as f:
            f.write(latex_fig)

    if 'brainwords' in actions:
        plot_brain_words(brain_scores, 'VG-VIS combined')

        if 'commonsubset' in loadpath:
            commonsub = 'commonsubset'
            caption_comsub = "the embeddings' common subset of"
        else:
            commonsub = 'full'
            caption_comsub = "the full"

        fname = f'brain_{commonsub}_{fname_pad}'
        fpath = 'figs/' + fname + '.png'
        latex_fig = '\\begin{figure}\n\centering\n' + \
                    '\includegraphics[width=\\textwidth]{figs/' + fname + '.png}' + \
                    '\n\caption{' + f'Scores on {caption_comsub} the Brain datasets words, ordered by ' + \
                    f'their WordNet concreteness. The scores are the number of hits per word, averaged over ' + \
                    f'all participants. Mid-fusion method: {title_pad}.' + '}\n' + \
                    '\label{f:' + fname + '}' + \
                    '\n\end{figure}\n\n'

        # Save figure and figures tex file
        plt.savefig(fpath, bbox_inches='tight')
        with open('figs/figs_brain.tex', 'a+') as f:
            f.write(latex_fig)

    if 'printcorr' in actions:
        if scores != {}:
            for name, scrs in scores.items():
                # print(f'\n-------- {name} scores -------\n')
                if common_subset:
                    caption = f'Spearman correlation on the common subset of the {name} dataset. '
                    commonsub = 'commonsubset'
                else:
                    caption = f'Spearman correlation on the {name} dataset. '
                    commonsub = 'full'
                print_correlations(scrs, name_pairs=print_corr_for, common_subset=common_subset,
                                   tablefmt=tablefmt, caption=caption + score_explanation,
                                   label='_'.join([name, commonsub, fname_pad]))

    if 'printcorr_subsample' in actions:
        if scores != {}:
            for name, scrs in scores.items():
                # print(f'\n-------- {name} scores -------\n')
                if common_subset:
                    caption = f'Cross-validated Spearman correlations on the common subset of the {name} dataset. '
                    commonsub = 'commonsubset'
                else:
                    caption = f'Cross-validated Spearman correlations on the {name} dataset. '
                    commonsub = 'full'
                caption += 'Spearman and P-value columns report $<$mean (STD)$>$ of three samples after leaving out the third of the evaluation pairs. '
                print_subsampled_correlations(scrs, name_pairs=print_corr_for, common_subset=common_subset,
                                   tablefmt=tablefmt, caption=caption + score_explanation0,
                                   label='_'.join([name, commonsub, fname_pad, 'crossval']), n_sample=3)

    if 'printbraincorr' in actions:
        if 'commonsubset' in loadpath:
            caption_suffix = ' on the common subset of vocabularies. '
            commonsub = 'commonsubset'
        else:
            caption_suffix = '. '
            commonsub = 'full'
        print_brain_scores(brain_scores, tablefmt=tablefmt, caption=score_explanation, suffix=caption_suffix,
                           label=f'{commonsub}_{fname_pad}')

    if 'coverage' in actions:
        for vocab, name in zip(embeddings.vocabs, embeddings.vecs_names):
            print('\n--------------' + name + '--------------\n')
            coverage(vocab, datasets)

    if savepath:
        print('Saving...')
        if scores != {}:
            for name, scs in scores.items():
                np.save(savepath + f'_{name}.npy', scs)
            with open(savepath + '_pairs.json', 'w') as f:
                json.dump(pairs, f)
        if brain_scores != {}:
            with open(savepath + '_brain.json', 'w') as f:
                json.dump(brain_scores, f)


if __name__ == '__main__':
    argh.dispatch_commands([main, divide_eval_vocab_by_freqranges])