Source code for source.process_vg

import os
import json
from collections import Counter
import argh
from argh import arg
import spacy
from tqdm import tqdm

from source.text_process import pmi_for_words


[docs]def vg_dists(datadir='/Users/anitavero/projects/data/visualgenome'): with open(os.path.join(datadir, 'vg_contexts_rad3_lemmatised1.txt'), 'r') as f: ctx = [pair.split() for pair in tqdm(f.read().split('\n'))] words = [] for pair in ctx: if len(pair) < 2: print('MISSING', pair) else: words.append(pair[0]) with open(os.path.join(datadir, 'vg_contexts_rad3_lemmatised1_dists.json'), 'w') as f: json.dump(Counter(words), f)
[docs]@arg('-vs', '--variants', nargs='+', type=str, required=True) def vg_pmis(words_file, datadir='/Users/anitavero/projects/data/visualgenome', bigram_file='bigram_vg.pkl', variants=['ppmi']): """Save PMI scores for bigrams including words in file word_list. :param words_file: json file name in data_dir, consisting of an str list :param datadir: path to directory with data """ with open(os.path.join(datadir, words_file), 'r') as f: words = json.load(f) ctx = [] if bigram_file == None: with open(os.path.join(datadir, 'vg_contexts_rad3_lemmatised1.txt'), 'r') as f: ctx = [pair.split() for pair in tqdm(f.read().split('\n'), desc='Read VG contexts')] pmis = pmi_for_words(words, finder_file=os.path.join(datadir, bigram_file), document_list=ctx, variants=variants) print(f'Save {", ".join(variants)}') with open(os.path.join(datadir, words_file.replace('.', f'_VG_{"_".join(variants)}.')), 'w') as f: json.dump(pmis, f)
[docs]def description_corpus(region_descriptions, lemmatise): """Return all descriptions as a corpus in form of list of strings (sentences).""" nlp = spacy.load('en') corpus = [] for rg in tqdm(region_descriptions): if lemmatise: corpus += [' '.join([w.lemma_ for w in nlp(r['phrase']) if not w.is_punct]) for r in rg['regions']] else: corpus += [r['phrase'] for r in rg['regions']] return corpus
[docs]def save_description_corpus(datadir, lemmatise=True): region_descriptions = json.load(open(datadir + '/region_descriptions.json')) corpus = description_corpus(region_descriptions, lemmatise) with open(datadir + '/region_description_corpus.txt', 'w', encoding='utf-8') as f: f.write('\n'.join(corpus))
if __name__ == '__main__': argh.dispatch_commands([vg_dists, vg_pmis, save_description_corpus])