Source code for source.train_word2vecf

import os
import argh
import subprocess
import time

import source.vecs2nps as vecs2nps


def train(contexts_file, save_dir, w2v_dir, filename_suffix='', min_count=10, size=300, negative=15, threads=4):
    """Perform the steps to train word2vecf on a given corpus:

    1. Create the input data: plain text with one (word, context) pair per
       line (assumed to have been produced beforehand, e.g. by a
       text2w2vf-style preprocessing step; see the commented-out code below).

    2. Create word and context vocabularies:

           ./myword2vec/count_and_filter -train dep.contexts -cvocab cv -wvocab wv -min-count 100

       This counts the words and contexts in dep.contexts, discards words or
       contexts appearing < 100 times, and writes the counted words to `wv`
       and the counted contexts to `cv`.

    3. Train the embeddings:

           ./myword2vec/word2vecf -train dep.contexts -wvocab wv -cvocab cv -output dim200vecs -size 200 -negative 15 -threads 10

       This trains 200-dim embeddings based on `dep.contexts`, `wv` and `cv`
       (lines in `dep.contexts` with a word not in `wv` or a context not in
       `cv` are ignored). The -dumpcv flag can be used to dump the trained
       context vectors as well:

           ./myword2vec/word2vecf -train dep.contexts -wvocab wv -cvocab cv -output dim200vecs -size 200 -negative 15 -threads 10 -dumpcv dim200context-vecs

    4. Convert the embeddings to numpy-readable format.
    """
    # # 1. Create input data, which is in the form of (word, context) pairs.
    # print('Create context pairs')
    # contexts_file = os.path.join(save_dir, f'context_pairs{filename_suffix}.txt')
    # text2w2vf(corpus, contexts_file, window, vocab, threads)

    start = time.time()

    # 2. Create word and context vocabularies.
    print('Create vocabularies')
    cv = os.path.join(save_dir, f'cv_{filename_suffix}')
    wv = os.path.join(save_dir, f'wv_{filename_suffix}')
    output = subprocess.run(
        [f'{w2v_dir}/count_and_filter',
         '-train', contexts_file,
         '-cvocab', cv,
         '-wvocab', wv,
         '-min-count', str(min_count)],
        stdout=subprocess.PIPE)
    print(output.stdout.decode('utf-8'))
    with open(os.path.join(save_dir, f'trainlog{filename_suffix}.log'), 'w') as f:
        f.write(output.stdout.decode('utf-8'))

    # 3. Train the embeddings (and dump the context vectors alongside them).
    print('Train the embeddings')
    modelfn = os.path.join(save_dir, f'model{filename_suffix}')
    contextvecs = os.path.join(save_dir, f'context-vecs{filename_suffix}')
    output = subprocess.run(
        [f'{w2v_dir}/word2vecf',
         '-train', contexts_file,
         '-cvocab', cv,
         '-wvocab', wv,
         '-output', modelfn,
         '-size', str(size),
         '-negative', str(negative),
         '-threads', str(threads),
         '-dumpcv', contextvecs],
        stdout=subprocess.PIPE)
    print(output.stdout.decode('utf-8'))

    # Report elapsed wall-clock time as whole hours/minutes/seconds.
    end = time.time()
    etime = end - start
    hours = int(etime // 3600 % 24)
    minutes = int(etime // 60 % 60)
    seconds = int(etime % 60)
    print(f'Training time: {hours}h {minutes}m {seconds}s')

    with open(os.path.join(save_dir, f'trainlog{filename_suffix}.log'), 'a') as f:
        f.write('\nTrain:\n')
        f.write(output.stdout.decode('utf-8'))

    # Remove the huge concatenated context file after training.
    print(f'Removing {contexts_file}')
    os.remove(contexts_file)

    # 4. Convert the embeddings to numpy-readable format.
    print('Convert the embeddings to numpy-readable format')
    vecs2nps.main(modelfn, modelfn)
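
# Input format note (a sketch, not part of this module): count_and_filter and
# word2vecf read `contexts_file` as plain text with one whitespace-separated
# (word, context) pair per line. The context labels below are purely
# illustrative; the real ones depend on how the pairs were extracted:
#
#   australian  scientist/amod
#   scientist   australian/amod-1
#   discovers   star/dobj
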
if __name__ == '__main__':
    argh.dispatch_command(train)
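
# Example usage (a sketch; all paths and values here are hypothetical).
# argh.dispatch_command exposes the positional parameters (contexts_file,
# save_dir, w2v_dir) as positional CLI arguments and the keyword parameters
# as options:
#
#   python train_word2vecf.py dep.contexts models/ ./myword2vec \
#       --filename-suffix _dep --min-count 100 --size 200 --threads 10
#
# Or directly from Python:
#
#   from source.train_word2vecf import train
#   train('dep.contexts', 'models/', './myword2vec',
#         filename_suffix='_dep', min_count=100, size=200, threads=10)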