This function measures the amount of semantic shift of a given word between two gensim word2vec models. It is a basic implementation of William Hamilton (@williamleif) et al.'s measure of semantic change proposed in their paper "Cultural Shift or Linguistic Drift?" (https://arxiv.org/abs/1606.02821), which they call the "local neighborhood measure."
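In short, the measure works as follows (this restates what the code below computes): for a word w, take its k nearest neighbors N1 in model1 and N2 in model2, form the combined neighborhood N = N1 ∪ N2 (restricted to words present in both models), and compare the two second-order similarity vectors:

    s_i(w) = [ sim_i(w, n) for n in N ],   i = 1, 2
    shift(w) = cosine_distance(s_1(w), s_2(w))

A small distance means the word keeps roughly the same local associations in both models; a large distance indicates semantic shift.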
def measure_semantic_shift_by_neighborhood(model1, model2, word, k=25, verbose=False):
    """
    Basic implementation of William Hamilton (@williamleif) et al.'s measure of semantic change
    proposed in their paper "Cultural Shift or Linguistic Drift?" (https://arxiv.org/abs/1606.02821),
    which they call the "local neighborhood measure." They find this measure better suited to capturing
    the semantic change of nouns caused by "cultural shift," i.e. changes in meaning "local" to that word,
    as opposed to global changes in language use ("linguistic drift"), which are better captured by the
    Procrustes-alignment method also described in the same paper.

    Arguments:
    - `model1`, `model2`: gensim word2vec models.
    - `word`: a string representation of a given word.
    - `k`: the size of the word's neighborhood (the number of its closest words in its vector space).
    - `verbose`: if True, print the word's neighborhood in each model.
    """
    # Import function for cosine distance
    from scipy.spatial.distance import cosine

    # Check that this word is present in both models
    if word not in model1.vocab or word not in model2.vocab:
        print("!! Word %s not present in both models." % word)
        return None

    # Get the two neighborhoods
    neighborhood1 = [w for w, c in model1.most_similar(word, topn=k)]
    neighborhood2 = [w for w, c in model2.most_similar(word, topn=k)]

    # Print?
    if verbose:
        print('>> Neighborhood of associations of the word "%s" in model1:' % word)
        print(', '.join(neighborhood1))
        print()
        print('>> Neighborhood of associations of the word "%s" in model2:' % word)
        print(', '.join(neighborhood2))

    # Get the 'meta' neighborhood (both combined)
    meta_neighborhood = list(set(neighborhood1) | set(neighborhood2))

    # Filter the meta neighborhood so that it contains only words present in both models
    meta_neighborhood = [w for w in meta_neighborhood if w in model1.vocab and w in model2.vocab]

    # For both models, get a similarity vector between the focus word and all of the words in the meta neighborhood
    vector1 = [model1.similarity(word, w) for w in meta_neighborhood]
    vector2 = [model2.similarity(word, w) for w in meta_neighborhood]

    # Compute the cosine distance *between* those similarity vectors
    dist = cosine(vector1, vector2)

    # Return this cosine distance -- a measure of the relative semantic shift for this word between these two models
    return dist
""" | |
Example usage: | |
model1 = [a gensim model I have for text published in the 1750s] | |
model2 = [a gensim model I have for text published in the 1850s] | |
# The word 'god' does not change much in meaning: | |
In [61]: measure_semantic_shift_by_neighborhood(model1,model2,'god',k=10,verbose=True) | |
>> Neighborhood of associations of the word "god" in model1: | |
almighty, jehovah, creator, uncreated, omniscient, logos, righteousness, christ, redeemer, salvation | |
>> Neighborhood of associations of the word "god" in model2: | |
almighty, heaven, jehovah, creator, redeemer, christ, divine, righteousness, providence, saviour | |
Out[61]: 0.011609088245951749 | |
# The word 'matter' does, moving from meaning mainly the "matter" of the universe to "what is the matter": | |
In [62]: measure_semantic_shift_by_neighborhood(model1,model2,'matter',k=10,verbose=True) | |
>> Neighborhood of associations of the word "matter" in model1: | |
cohesion, sediment, menstruum, purulent, conceivable, gelatinous, morbific, compression, cerebellum, divisible | |
>> Neighborhood of associations of the word "matter" in model2: | |
matters, question, subject, affair, substance, concernment, concerns, questions, controversy, discussion | |
Out[62]: 0.0847526073498025 | |
# The word 'station' changes even more, moving from meaning one's social rank or "station", to a train station: | |
In [63]: measure_semantic_shift_by_neighborhood(model1,model2,'station',k=10,verbose=True) | |
>> Neighborhood of associations of the word "station" in model1: | |
stations, dation, sphere, employments, deg, vocation, personate, lowest, district, apprenticeship | |
>> Neighborhood of associations of the word "station" in model2: | |
stations, train, posts, position, situation, town, carriage, stationed, rank, cab | |
Out[63]: 0.14173381265358098 | |
""" |