Source code for test.test_unsupervised_metrics

import sys, os
sys.path.append(os.getcwd())
sys.path.append(os.getcwd() + '/source')

from source.unsupervised_metrics import *
import numpy as np


def test_order_words_by_centroid_distance():
    """Check that each cluster's word list ends up sorted by ascending distance.

    A word -> distance mapping is written to
    'test/data/dists_from_centr_labels.json' before the call. The expected
    result orders every cluster's words from the smallest to the largest
    value in that mapping, and ``clusters`` is compared without being
    reassigned, so the function under test must reorder the lists in place.
    """
    # Imported locally: this module has no top-level ``import json`` and would
    # otherwise rely on the wildcard import above re-exporting the name.
    import json

    clusters = [(0, ['c', 'b', 'a']), (1, ['e', 'd'])]
    cl_labels = {'a': 0, 'b': 0.1, 'c': 1, 'd': 0.1, 'e': 0.2}
    cluster_label_filepath = 'test/data/test_cluster_labels.json'

    # NOTE(review): the file written here differs from cluster_label_filepath;
    # presumably the function derives the distances path from it -- confirm.
    with open('test/data/dists_from_centr_labels.json', 'w') as f:
        json.dump(cl_labels, f)

    order_words_by_centroid_distance(clusters, cluster_label_filepath)

    assert clusters == [(0, ['a', 'b', 'c']), (1, ['d', 'e'])]
def test_distances_from_centroids():
    """Check the per-word distance to the centroid each word is assigned to.

    The expected values match cosine distance (1 - cosine similarity):
    'a' is parallel to centroid 0, 'b' and 'c' are orthogonal to their
    assigned centroids, and 'd' = [1, 1, 1] against centroid 1 = [0, 4, 0]
    gives 1 - 4 / (sqrt(3) * sqrt(16)).
    """
    emb = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 1, 1]])
    vocab = ['a', 'b', 'c', 'd']
    centroids = np.array([[2, 0, 0], [0, 4, 0]])
    label_dict = {'a': 0, 'b': 0, 'c': 1, 'd': 1}

    dists = distances_from_centroids(emb, vocab, label_dict, centroids)

    expected = {
        'a': 0.0,
        'b': 1.0,
        'c': 1.0,
        'd': 1 - (4 / (np.sqrt(3) * np.sqrt(16))),
    }
    # Compare every value with a tolerance: the results are floating point,
    # so exact equality would make the test depend on rounding details of
    # the distance implementation.
    for word, value in expected.items():
        assert np.isclose(dists[word], value), word
def test_get_clustering_labels_metrics():
    """Run k-means labelling for 4, 3 and 2 clusters and count distinct labels.

    Test data which is loaded from 'test/data':
        test_model = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]])
        test_vocab = ['a', 'b', 'c', 'd']
    """
    # (requested number of clusters, label file expected to be saved for it)
    cases = (
        (4, 'test/data/cluster_labels_kmeans_test_model_nc4.npy'),
        (3, 'test/data/cluster_labels_kmeans_test_model_nc3.npy'),
        (2, 'test/data/cluster_labels_kmeans_test_model_nc2.npy'),
    )

    for cluster_count, labels_path in cases:
        get_clustering_labels_metrics(
            ['test_model'],
            datadir='test/data/',
            savedir='test/data/',
            cluster_method='kmeans',
            n_clusters=cluster_count,
            random_state=1,
            eps=0.5,
            min_samples=90,
            workers=1,
            suffix=''
        )
        saved_labels = np.load(labels_path)
        print(saved_labels)
        # Every requested cluster must show up among the saved labels.
        assert len(set(saved_labels)) == cluster_count
def test_n_nearest_neighbors():
    """Exercise get_n_nearest_neighbors with n=1, n=3, one query and no query.

    The assertions read row 0 of the result as the query words themselves
    and the rows below it as the neighbours, one column per query word.
    """
    vocabulary = np.array(['a', 'b', 'c', 'd', 'e'])
    embeddings = np.array([[1, 0], [0, 1], [1, 1], [1, 0.5], [0.5, 1]])
    queries = np.array(['a', 'c', 'e'])

    # Single neighbour per query word; only the first two columns are checked.
    result = get_n_nearest_neighbors(queries, embeddings, vocabulary, n=1)
    assert (result[0, :] == queries).all()
    for column in (0, 1):
        assert (result[1:, column] == np.array(['d'])).all()

    # Three neighbours per query word, in the expected order.
    result = get_n_nearest_neighbors(queries, embeddings, vocabulary, n=3)
    assert (result[0, :] == queries).all()
    expected_columns = (['d', 'c', 'e'], ['d', 'e', 'a'])
    for column, expected in enumerate(expected_columns):
        assert (result[1:, column] == np.array(expected)).all()

    # A single query word.
    queries = np.array(['b'])
    result = get_n_nearest_neighbors(queries, embeddings, vocabulary, n=3)
    assert (result[0, :] == queries).all()
    assert (result[1:, 0] == np.array(['e', 'c', 'd'])).all()

    # No query words at all: the result must be empty.
    queries = np.array([])
    result = get_n_nearest_neighbors(queries, embeddings, vocabulary, n=3)
    assert result.size == 0