Source code for ethically.we.utils

import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score

# Gensim classes that `assert_gensim_keyed_vectors` accepts as valid
# word embedding models (used there as the `isinstance` type tuple).
WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long

def round_to_extreme(value, digits=2):
    """Round ``value`` away from zero, keeping ``digits`` decimal places.

    The magnitude is rounded up (ceiling) and the original sign is then
    restored, so positive values grow and negative values shrink.

    :param value: Number to round.
    :param int digits: Number of decimal places to keep.
    :return: The rounded number.
    """
    scale = 10 ** digits
    magnitude = math.ceil(abs(value) * scale) / scale
    return -magnitude if value < 0 else magnitude

def normalize(v):
    """Scale a 1-D vector to unit Euclidean length.

    :param v: 1-D array.
    :return: ``v`` divided by its Euclidean norm; ``v`` itself, unchanged,
             when its norm is zero.
    :raises ValueError: If ``v`` is not 1-D.
    """
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(v.ndim))

    length = np.linalg.norm(v)
    # A zero vector has no direction; hand it back untouched instead of
    # dividing by zero.
    return v if length == 0 else v / length
def cosine_similarity(v, u):
    """Return the cosine of the angle between the vectors ``v`` and ``u``.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    dot_product = v @ u
    norms_product = np.linalg.norm(v) * np.linalg.norm(u)
    return dot_product / norms_product
def project_vector(v, u):
    """Return the projection of the vector ``v`` onto the direction ``u``.

    ``u`` is normalized first, so only its direction matters.
    """
    unit_u = normalize(u)
    scalar = v @ unit_u
    return scalar * unit_u
def reject_vector(v, u):
    """Return the rejection of the vector ``v`` from the direction ``u``.

    This is what remains of ``v`` after subtracting its projection
    onto ``u``.
    """
    projected = project_vector(v, u)
    return v - projected
def project_reject_vector(v, u):
    """Split the vector ``v`` along the direction ``u``.

    :return: Tuple ``(projection, rejection)`` where ``projection`` is the
             projection of ``v`` onto ``u`` and ``rejection`` is
             ``v - projection``.
    """
    along = project_vector(v, u)
    return along, v - along
def project_params(u, v):
    """Project and reject the vector ``v`` along the direction ``u``.

    Note the argument order: the direction ``u`` comes first.

    :return: Tuple ``(projection, projected_vector, rejected_vector)``:
             the scalar projection of ``v`` on the normalized ``u``, that
             scalar times the normalized ``u``, and ``v`` minus the
             projected vector.
    """
    unit_u = normalize(u)
    scalar = v @ unit_u
    along = scalar * unit_u
    return scalar, along, v - along
def cosine_similarities_by_words(model, word, words):
    """Compute cosine similarities between a word and a set of other words.

    :param model: Object supporting ``model[word]`` vector lookup and
                  providing ``cosine_similarities(vector, vectors)``.
    :param str word: The reference word.
    :param words: Iterable of words (not a single string) to compare
                  against ``word``.
    :return: Whatever ``model.cosine_similarities`` returns for the vector
             of ``word`` against the vectors of ``words``.
    :raises AssertionError: If ``word`` is not a string or ``words`` is one.
    """
    assert isinstance(word, string_types), \
        'The argument `word` should be a string.'
    # A bare string would be iterated character by character below.
    assert not isinstance(words, string_types), \
        'The argument `words` should not be a string.'

    vec = model[word]
    vecs = [model[w] for w in words]
    return model.cosine_similarities(vec, vecs)
def update_word_vector(model, word, new_vector):
    """Overwrite the stored vector of ``word`` in ``model`` in place.

    When ``model.vectors_norm`` is populated, the matching row there is
    replaced by the normalized ``new_vector`` so both arrays stay in sync.
    """
    index = model.vocab[word].index
    model.vectors[index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[index] = normalize(new_vector)


def generate_one_word_forms(word):
    """Return the lower-case, upper-case and title-case forms of ``word``."""
    return [word.lower(), word.upper(), word.title()]


def generate_words_forms(words):
    """Return a flat list with the three case forms of every word in ``words``."""
    # Flat comprehension instead of ``sum(lists, [])``, which re-copies
    # the accumulated list for every word.
    return [form
            for word in words
            for form in generate_one_word_forms(word)]


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    """Keep the first and last ``n_extreme`` rows of ``df``.

    :param df: Data frame to take the rows from.
    :param int n_extreme: Number of rows to take from each end.
    :param part_column: Optional column name; when given, it is set to
                        ``head_value`` on the leading rows and to
                        ``tail_value`` on the trailing rows.
    :param head_value: Marker for the leading rows.
    :param tail_value: Marker for the trailing rows.
    :return: New data frame with the two parts concatenated, duplicate
             rows dropped and the index reset.
    """
    # Explicit copies: assigning a column on a slice of ``df`` triggers
    # pandas' SettingWithCopyWarning.
    head_df = df.head(n_extreme).copy()
    tail_df = df.tail(n_extreme).copy()

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))


def assert_gensim_keyed_vectors(model):
    """Raise ``TypeError`` unless ``model`` is one of ``WORD_EMBEDDING_MODEL_TYPES``."""
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        type_names = (model_type.__name__
                      for model_type in WORD_EMBEDDING_MODEL_TYPES)
        raise TypeError('model should be one of the types'
                        ' ({}), not {}.'
                        .format(', '.join(type_names), type(model)))
def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on Gensim implementation.

    :param model: Word embedding model of ``gensim.model.KeyedVectors``.
    :param list positive: List of words that contribute positively.
                          Items may be words, vectors (``numpy.ndarray``)
                          or ``(item, weight)`` pairs. A single word may
                          be given as a bare string when ``negative``
                          is empty.
    :param list negative: List of words that contribute negatively,
                          in the same forms as ``positive``. A single
                          word may be given as a bare string when
                          ``positive`` is empty.
    :param int topn: Number of top-N similar words to return.
                     If ``None``, the similarities to all the searched
                     vectors are returned instead; if smaller than 1,
                     an empty list is returned.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order.
                               (This may be meaningful if you've sorted
                               the vocabulary by descending frequency.)
    :param indexer: Optional object; when given, the result of
                    ``indexer.most_similar(mean, topn)`` is returned
                    directly.
    :param bool unrestricted: If ``True`` (the default), the input words
                              themselves may appear in the result.
                              If ``False``, words from the positive and
                              negative lists are excluded from it.
    :return: Sequence of (word, similarity).
    :raises ValueError: If both ``positive`` and ``negative`` are given
                        and one of them is a bare string, or if no
                        input is given at all.
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []

    if negative is None:
        negative = []

    model.init_sims()

    if isinstance(positive, string_types) and not negative:
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if isinstance(negative, string_types) and not positive:
        # same shorthand for most_similar(negative='dog'); without it the
        # string would be iterated character by character below
        negative = [negative]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    unweighted_types = string_types + (np.ndarray,)
    positive = [
        (word, 1.0) if isinstance(word, unweighted_types) else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, unweighted_types) else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    # fetch extra candidates so that ``topn`` results remain even when
    # all the input words are filtered out below
    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]
def get_seed_vector(seed, bias_word_embedding):
    """Resolve ``seed`` into a seed vector and its two end words.

    :param seed: ``'direction'`` to use ``bias_word_embedding.direction``,
                 ``'ends'`` to use the difference between the vectors of
                 the embedding's own positive and negative ends, or a
                 pair ``(positive_end, negative_end)`` of words to use
                 the difference between their vectors.
    :param bias_word_embedding: Object exposing ``positive_end``,
                                ``negative_end``, ``direction`` and
                                ``model``.
    :return: Tuple ``(seed_vector, positive_end, negative_end)``.
    """
    use_direction = (seed == 'direction')

    if use_direction or seed == 'ends':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
    else:
        positive_end, negative_end = seed

    if use_direction:
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        word_vectors = bias_word_embedding.model
        difference = word_vectors[positive_end] - word_vectors[negative_end]
        seed_vector = normalize(difference)

    return seed_vector, positive_end, negative_end


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
    """Cluster ``X`` into two groups, plot them, and score against ``y_true``.

    The rows of ``X`` are split into two clusters with K-Means and drawn
    as a 2-D t-SNE embedding, one scatter series per cluster.

    :param X: Samples to cluster and embed.
    :param y_true: Reference labels the clusters are compared with.
    :param random_state: Random state passed to both K-Means and t-SNE.
    :param ax: Axes to draw on; a new 10x5 figure is created if ``None``.
    :return: ``max(acc, 1 - acc)`` where ``acc`` is the accuracy of the
             cluster assignment against ``y_true`` (the cluster numbering
             is arbitrary, so either labeling counts).
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    kmeans = KMeans(n_clusters=2, random_state=random_state)
    cluster_labels = kmeans.fit_predict(X)

    tsne = TSNE(n_components=2, random_state=random_state)
    points_2d = tsne.fit_transform(X)

    for cluster in np.unique(cluster_labels):
        in_cluster = (cluster_labels == cluster)
        ax.scatter(points_2d[in_cluster, 0],
                   points_2d[in_cluster, 1],
                   label='Positive' if cluster else 'Negative')

    ax.legend()

    accuracy = accuracy_score(y_true, cluster_labels)
    return max(accuracy, 1 - accuracy)