Creating meaningful categories based on classifications parsed from song names.
from collections import Counter
import glob
import re

# Define the directory containing the source audio files
dir_path = r"E:\Dubstep_diffusion\tracks"

# List all .wav and .mp3 files using glob with wildcard matching
lines = glob.glob(dir_path + '/*.wav') + glob.glob(dir_path + '/*.mp3')

# Optionally inspect the list of files
# for item in lines:
#     print(item)

# Counter to store word occurrences
word_count = Counter()

for line in lines:
    # Remove the file extension and the key suffix at the end
    cleaned_line = line.rsplit('.', 1)[0].rsplit('=', 1)[0]
    # Remove unwanted characters: (, ), [, ], commas and periods
    cleaned_line = re.sub(r"[()\[\],.]", "", cleaned_line)
    # Replace underscores with spaces and convert to upper case
    cleaned_line = cleaned_line.replace('_', ' ').upper()
    # Collapse multiple spaces into a single space
    cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
    # Split into words and update the counter with this line's words
    word_count.update(cleaned_line.split())

# Function to load stopwords (one per line) from a file
def load_stopwords(filename):
    with open(filename, 'r') as file:
        return set(word.strip().upper() for word in file)

# Load stopwords from file
stopwords = load_stopwords('title_parse_stopwords.txt')

# Keep words that appear more than once, are longer than two characters,
# contain no backslashes (i.e. no path fragments such as '\TRACKS\'),
# are not stopwords, and are not bare BPM markers like '140BPM'
filtered_words = {word: count for word, count in word_count.items()
                  if count > 1
                  and len(word) > 2
                  and '\\' not in word
                  and word not in stopwords
                  and not re.match(r'^\d+BPM$', word)}

# Sort the words by their frequency in descending order
sorted_filtered_words = sorted(filtered_words.items(), key=lambda item: item[1], reverse=True)
print(len(sorted_filtered_words))

# Display the filtered words (rendered automatically in a notebook cell)
sorted_filtered_words

# Save sorted_filtered_words to lexicon.txt
with open('lexicon.txt', 'w') as file:
    for word, count in sorted_filtered_words:
        file.write(f"{word}: {count}\n")
print("Lexicon has been saved to 'lexicon.txt'")

# Optional: display the contents of the file
# with open('lexicon.txt', 'r') as file:
#     print(file.read())

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Convert the list of tuples to a frequency dictionary
word_freq = dict(sorted_filtered_words)

# Create and generate a word cloud image from the frequencies
wordcloud = WordCloud(width=4096, height=2048, max_words=600,
                      background_color='white').generate_from_frequencies(word_freq)

# Optionally display the generated image
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

# Save the word cloud as an image file
wordcloud.to_file("wordcloud.png")
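To make the title parsing concrete, here is a minimal sketch of the cleaning steps applied to one hypothetical filename (the path and title below are invented for illustration and are not taken from the actual track list):

import re

# Hypothetical filename, invented purely for illustration
path = r"E:\Dubstep_diffusion\tracks\Dark_Ambient_Dubstep_(VIP_edit)_140BPM=F.wav"

title = path.rsplit('.', 1)[0].rsplit('=', 1)[0]   # strip extension and key suffix
title = re.sub(r"[()\[\],.]", "", title)           # drop brackets, commas and periods
title = title.replace('_', ' ').upper()            # normalise separators and case
title = re.sub(r'\s+', ' ', title).strip()         # collapse whitespace

print(title.split())
# ['E:\\DUBSTEP', 'DIFFUSION\\TRACKS\\DARK', 'AMBIENT', 'DUBSTEP', 'VIP', 'EDIT', '140BPM']
# Tokens containing '\' and the bare '140BPM' token are dropped by the later filter,
# 'VIP' and 'EDIT' are removed as stopwords, leaving 'AMBIENT' and 'DUBSTEP' as
# candidate category words (provided they occur more than once across the collection).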
title_parse_stopwords.txt
2020 | |
ABM | |
AGAIN | |
ALL | |
ALWAYS | |
AND | |
APOCALYPTIC | |
ARE | |
BARRY | |
BENT | |
BOTTOM | |
BUILDS | |
CAN | |
CANDY | |
CATCH | |
CAR | |
COLD | |
COMPLETE | |
COOL | |
CUT | |
CUTS | |
DEEPER | |
DESTROYER | |
DETAIL | |
DONT | |
EDIT | |
FEAT | |
FEEL | |
FING | |
FIREFLIES | |
FOR | |
FOREVER | |
FRAGMENTS | |
GENERAL | |
GUEST | |
HAVE | |
HEAR | |
HEAT | |
HEBETUDE | |
HITHAT | |
HOME | |
HOW | |
JOHN | |
KIND | |
KNOW | |
LIFE | |
LIKE | |
MAKE | |
MIND | |
MORE | |
MORNING | |
MOTION | |
NAVSTA | |
NEW | |
NGOSI | |
NISTRUM | |
NOW | |
ONLY | |
ORB | |
OUR | |
PAX | |
ROD | |
SAID | |
SCARRZ | |
SECTION | |
SPAGHETTI | |
STUFF | |
SUBSYST | |
SUGAR | |
SUN | |
SUNCASTLE | |
SUNSHINE | |
SUPER | |
SUPPLY | |
TAKE | |
TEXTUREDGROOVES | |
THAT | |
THE | |
TILL | |
TIME | |
TWO | |
TWOBOB | |
TWOBOBS | |
VAST | |
VIBRO | |
WAY | |
WITH | |
WITHOUT | |
WHAT | |
WHEN | |
WHERE | |
WHO | |
#WORD | |
WORLD | |
YOU | |
YOUR | |
YOURE | |
ZENITH | |
# | |
END | |
CREEPY | |
CRYPT | |
DOUBLEKICK | |
PURE | |
WORD | |
THREEFOUR | |
EIGTH | |
PROG | |
THEATRICAL | |
MANGA | |
AUDIO | |
LISTENING | |
MACHINES | |
BAND | |
SPACES | |
BIG | |
BOUNCY | |
ROBOT | |
CUE | |
DRIVEN | |
TUNE | |
WASTED | |
OFFBEATS | |
CLOCKWORK | |
DRIFTING | |
TARANTINO | |
HIGHNOON | |
STANDOFF | |
STYLISED | |
DAWN | |
LOTR | |
RELIGIOUS | |
RESOLUTION | |
REMIXED | |
MURDER | |
BENDY | |
STAR | |
BLAZING | |
BLING | |
POETRY | |
BOSON | |
SPIN | |
ROOM | |
VIP | |
COLORFUL | |
SHINING | |
RAFIQI | |
SHORT | |
STATIS | |
SUBMARINE | |
DEVRAS | |
PLEXI | |
COMPRESS | |
DJINTRO | |
VIRTUSO | |
EVERYTHING | |
SONG | |
ANDREWS | |
BEEFHEART | |
SIGNALS | |
QUIVER | |
INDUCTIVEONE | |
LAWLER | |
NAVSTA | |
REMASTERED | |
NUMBERSINTHEDREAM | |
SPAGHETTIWESTERN | |
FORWARD | |
MAD | |
WAITING | |
TIRED |
Results: 511 polytonic classes with frequencies, extracted from 472 .wav titles containing tags.
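One possible follow-up, sketched below under assumptions (the helper names, the minimum-count threshold, and the match-by-title approach are illustrative and not part of the gist), is to read lexicon.txt back in and tag each track with the lexicon words found in its cleaned title:

import glob
import re

# Minimal sketch: load 'WORD: count' lines back into a dict, keeping frequent words only.
def load_lexicon(filename, min_count=2):
    lexicon = {}
    with open(filename, 'r') as file:
        for line in file:
            word, sep, count = line.rpartition(':')
            if sep:
                lexicon[word.strip()] = int(count)
    return {w: c for w, c in lexicon.items() if c >= min_count}

# Tag a single track: reuse the same cleaning steps, then intersect with the lexicon.
def categorise(path, lexicon):
    title = path.rsplit('.', 1)[0].rsplit('=', 1)[0]
    title = re.sub(r"[()\[\],.]", "", title).replace('_', ' ').upper()
    words = set(re.sub(r'\s+', ' ', title).strip().split())
    return sorted(words & lexicon.keys())

lexicon = load_lexicon('lexicon.txt')
for track in glob.glob(r"E:\Dubstep_diffusion\tracks\*.wav"):
    print(track, '->', categorise(track, lexicon))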