# Source code for test.test_unsupervised_metrics

import sys, os
sys.path.append(os.getcwd())
sys.path.append(os.getcwd() + '/source')

from source.unsupervised_metrics import *
import numpy as np


def test_order_words_by_centroid_distance():
    """Check that ``order_words_by_centroid_distance`` reorders cluster words in place.

    A fixture mapping each word to a distance value is written to
    'test/data/dists_from_centr_labels.json'; afterwards the word list of
    every cluster is expected to be ordered by ascending fixture value.
    """
    # Imported locally so the test does not depend on `json` leaking through
    # the module-level star import.
    import json

    clusters = [(0, ['c', 'b', 'a']), (1, ['e', 'd'])]
    dists_from_centroid = {'a': 0, 'b': 0.1, 'c': 1, 'd': 0.1, 'e': 0.2}
    cluster_label_filepath = 'test/data/test_cluster_labels.json'

    # NOTE(review): the fixture is written under a different file name than
    # the path handed to the function below; presumably the function derives
    # the distances file name from the label path -- confirm.
    with open('test/data/dists_from_centr_labels.json', 'w') as f:
        json.dump(dists_from_centroid, f)

    # The function is expected to mutate `clusters` rather than return a copy.
    order_words_by_centroid_distance(clusters, cluster_label_filepath)

    # NOTE(review): the expected order is also plain alphabetical order, so
    # this assertion alone cannot tell distance sorting from name sorting.
    assert clusters == [(0, ['a', 'b', 'c']), (1, ['d', 'e'])]


def test_distances_from_centroids():
    """Check the per-word distances returned by ``distances_from_centroids``.

    Words 'a' and 'b' are labelled with centroid 0 ([2, 0, 0]); 'c' and 'd'
    with centroid 1 ([0, 4, 0]). The expected values are the cosine distances
    (1 - cosine similarity) between each word vector and its centroid.
    """
    emb = np.array([[1, 0, 0],
                    [0, 1, 0],
                    [0, 0, 1],
                    [1, 1, 1]])
    vocab = ['a', 'b', 'c', 'd']
    centroids = np.array([[2, 0, 0],
                          [0, 4, 0]])
    label_dict = {'a': 0, 'b': 0, 'c': 1, 'd': 1}

    dists = distances_from_centroids(emb, vocab, label_dict, centroids)

    expected = {
        'a': 0.0,  # parallel to its centroid
        'b': 1.0,  # orthogonal to its centroid
        'c': 1.0,  # orthogonal to its centroid
        'd': 1 - (4 / (np.sqrt(3) * np.sqrt(16))),
    }
    # Compare with a tolerance throughout: the values are computed floats, so
    # exact equality would make the test sensitive to rounding.
    for word, want in expected.items():
        assert np.isclose(dists[word], want), (word, dists[word], want)


def test_get_clustering_labels_metrics():
    """
    Run ``get_clustering_labels_metrics`` with k-means for several cluster
    counts and check the label files it leaves in 'test/data/'.

    Test data which is loaded from 'test/data':
        test_model = np.array([[1, 1, 1],
                               [2, 2, 2],
                               [3, 3, 3],
                               [4, 4, 4]])
        test_vocab = ['a', 'b', 'c', 'd']

    For every requested ``n_clusters`` the saved label array must contain
    exactly that many distinct labels.
    """
    # Cluster counts are exercised in descending order: 4, 3, 2.
    for n_clusters in (4, 3, 2):
        # NOTE(review): eps / min_samples are passed alongside
        # cluster_method='kmeans'; presumably they are unused for k-means --
        # confirm.
        get_clustering_labels_metrics(['test_model'], datadir='test/data/',
                                      savedir='test/data/',
                                      cluster_method='kmeans',
                                      n_clusters=n_clusters, random_state=1,
                                      eps=0.5, min_samples=90,
                                      workers=1, suffix='')
        labels = np.load(
            'test/data/cluster_labels_kmeans_test_model_nc{}.npy'.format(n_clusters))
        print(labels)
        assert len(set(labels)) == n_clusters


def test_n_nearest_neighbors():
    """Check the neighbour table returned by ``get_n_nearest_neighbors``.

    The assertions treat the result ``NN`` as a 2-D array whose first row
    repeats the query words and whose remaining rows hold, per column, the
    neighbours of that column's query word from nearest to farthest.
    """
    vocab = np.array(['a', 'b', 'c', 'd', 'e'])
    words = np.array(['a', 'c', 'e'])
    E = np.array([[1, 0],
                  [0, 1],
                  [1, 1],
                  [1, 0.5],
                  [0.5, 1]])

    # assert_array_equal is used instead of `(x == y).all()` so that a result
    # with the wrong shape fails instead of passing through broadcasting.
    check = np.testing.assert_array_equal

    NN = get_n_nearest_neighbors(words, E, vocab, n=1)
    check(NN[0, :], words)
    check(NN[1:, 0], np.array(['d']))
    # NOTE(review): 'd' and 'e' are mirror images of each other about 'c', so
    # expecting 'd' first here (and 'd' before 'e' below) appears to rely on
    # how the implementation breaks ties -- confirm.
    check(NN[1:, 1], np.array(['d']))

    NN = get_n_nearest_neighbors(words, E, vocab, n=3)
    check(NN[0, :], words)
    check(NN[1:, 0], np.array(['d', 'c', 'e']))
    check(NN[1:, 1], np.array(['d', 'e', 'a']))

    # Single query word.
    words = np.array(['b'])
    NN = get_n_nearest_neighbors(words, E, vocab, n=3)
    check(NN[0, :], words)
    check(NN[1:, 0], np.array(['e', 'c', 'd']))

    # No query words: the result must be empty.
    words = np.array([])
    NN = get_n_nearest_neighbors(words, E, vocab, n=3)
    assert NN.size == 0