Creating meaningful categories based on classifications parsed from song names.
from collections import Counter
import glob
import re

# Define the directory containing the source audio files
dir_path = r"E:\Dubstep_diffusion\tracks"

# List all .wav and .mp3 files using glob with wildcard matching
lines = glob.glob(dir_path + '/*.wav') + glob.glob(dir_path + '/*.mp3')

# Optionally inspect the list of files
# for item in lines:
#     print(item)

# Counter to store word occurrences
word_count = Counter()

for line in lines:
    # Remove the file extension and the key suffix at the end
    cleaned_line = line.rsplit('.', 1)[0].rsplit('=', 1)[0]
    # Remove unwanted characters: (, ), [, ], commas and periods
    cleaned_line = re.sub(r"[()\[\],.]", "", cleaned_line)
    # Replace underscores with spaces and convert to upper case
    cleaned_line = cleaned_line.replace('_', ' ').upper()
    # Collapse multiple spaces into a single space
    cleaned_line = re.sub(r'\s+', ' ', cleaned_line).strip()
    # Split into words and update the counter with this line's words
    word_count.update(cleaned_line.split())

# Function to load stopwords (one per line) from a file
def load_stopwords(filename):
    with open(filename, 'r') as file:
        return set(word.strip().upper() for word in file)

# Load stopwords from file
stopwords = load_stopwords('title_parse_stopwords.txt')

# Keep words that appear more than once, are longer than two characters,
# contain no backslashes (i.e. no path fragments such as '\TRACKS\'),
# are not stopwords, and are not bare BPM markers like '140BPM'
filtered_words = {word: count for word, count in word_count.items()
                  if count > 1
                  and len(word) > 2
                  and '\\' not in word
                  and word not in stopwords
                  and not re.match(r'^\d+BPM$', word)}

# Sort the words by their frequency in descending order
sorted_filtered_words = sorted(filtered_words.items(), key=lambda item: item[1], reverse=True)
print(len(sorted_filtered_words))

# Display the filtered words (rendered automatically in a notebook cell)
sorted_filtered_words

# Save sorted_filtered_words to lexicon.txt
with open('lexicon.txt', 'w') as file:
    for word, count in sorted_filtered_words:
        file.write(f"{word}: {count}\n")
print("Lexicon has been saved to 'lexicon.txt'")

# Optional: display the contents of the file
# with open('lexicon.txt', 'r') as file:
#     print(file.read())

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Convert the list of tuples to a frequency dictionary
word_freq = dict(sorted_filtered_words)

# Create and generate a word cloud image from the frequencies
wordcloud = WordCloud(width=4096, height=2048, max_words=600,
                      background_color='white').generate_from_frequencies(word_freq)

# Optionally display the generated image
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

# Save the word cloud as an image file
wordcloud.to_file("wordcloud.png")
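To make the title parsing concrete, here is a minimal sketch of the cleaning steps applied to one hypothetical filename (the path and title below are invented for illustration and are not taken from the actual track list):

import re

# Hypothetical filename, invented purely for illustration
path = r"E:\Dubstep_diffusion\tracks\Dark_Ambient_Dubstep_(VIP_edit)_140BPM=F.wav"

title = path.rsplit('.', 1)[0].rsplit('=', 1)[0]   # strip extension and key suffix
title = re.sub(r"[()\[\],.]", "", title)           # drop brackets, commas and periods
title = title.replace('_', ' ').upper()            # normalise separators and case
title = re.sub(r'\s+', ' ', title).strip()         # collapse whitespace

print(title.split())
# ['E:\\DUBSTEP', 'DIFFUSION\\TRACKS\\DARK', 'AMBIENT', 'DUBSTEP', 'VIP', 'EDIT', '140BPM']
# Tokens containing '\' and the bare '140BPM' token are dropped by the later filter,
# 'VIP' and 'EDIT' are removed as stopwords, leaving 'AMBIENT' and 'DUBSTEP' as
# candidate category words (provided they occur more than once across the collection).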
title_parse_stopwords.txt
2020 | |
ABM | |
AGAIN | |
ALL | |
ALWAYS | |
AND | |
APOCALYPTIC | |
ARE | |
BARRY | |
BENT | |
BOTTOM | |
BUILDS | |
CAN | |
CANDY | |
CATCH | |
CAR | |
COLD | |
COMPLETE | |
COOL | |
CUT | |
CUTS | |
DEEPER | |
DESTROYER | |
DETAIL | |
DONT | |
EDIT | |
FEAT | |
FEEL | |
FING | |
FIREFLIES | |
FOR | |
FOREVER | |
FRAGMENTS | |
GENERAL | |
GUEST | |
HAVE | |
HEAR | |
HEAT | |
HEBETUDE | |
HITHAT | |
HOME | |
HOW | |
JOHN | |
KIND | |
KNOW | |
LIFE | |
LIKE | |
MAKE | |
MIND | |
MORE | |
MORNING | |
MOTION | |
NAVSTA | |
NEW | |
NGOSI | |
NISTRUM | |
NOW | |
ONLY | |
ORB | |
OUR | |
PAX | |
ROD | |
SAID | |
SCARRZ | |
SECTION | |
SPAGHETTI | |
STUFF | |
SUBSYST | |
SUGAR | |
SUN | |
SUNCASTLE | |
SUNSHINE | |
SUPER | |
SUPPLY | |
TAKE | |
TEXTUREDGROOVES | |
THAT | |
THE | |
TILL | |
TIME | |
TWO | |
TWOBOB | |
TWOBOBS | |
VAST | |
VIBRO | |
WAY | |
WITH | |
WITHOUT | |
WHAT | |
WHEN | |
WHERE | |
WHO | |
#WORD | |
WORLD | |
YOU | |
YOUR | |
YOURE | |
ZENITH | |
# | |
END | |
CREEPY | |
CRYPT | |
DOUBLEKICK | |
PURE | |
WORD | |
THREEFOUR | |
EIGTH | |
PROG | |
THEATRICAL | |
MANGA | |
AUDIO | |
LISTENING | |
MACHINES | |
BAND | |
SPACES | |
BIG | |
BOUNCY | |
ROBOT | |
CUE | |
DRIVEN | |
TUNE | |
WASTED | |
OFFBEATS | |
CLOCKWORK | |
DRIFTING | |
TARANTINO | |
HIGHNOON | |
STANDOFF | |
STYLISED | |
DAWN | |
LOTR | |
RELIGIOUS | |
RESOLUTION | |
REMIXED | |
MURDER | |
BENDY | |
STAR | |
BLAZING | |
BLING | |
POETRY | |
BOSON | |
SPIN | |
ROOM | |
VIP | |
COLORFUL | |
SHINING | |
RAFIQI | |
SHORT | |
STATIS | |
SUBMARINE | |
DEVRAS | |
PLEXI | |
COMPRESS | |
DJINTRO | |
VIRTUSO | |
EVERYTHING | |
SONG | |
ANDREWS | |
BEEFHEART | |
SIGNALS | |
QUIVER | |
INDUCTIVEONE | |
LAWLER | |
NAVSTA | |
REMASTERED | |
NUMBERSINTHEDREAM | |
SPAGHETTIWESTERN | |
FORWARD | |
MAD | |
WAITING | |
TIRED |
Results: 511 polytonic classes with frequencies, extracted from 472 .wav titles containing tags.
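One possible follow-up, sketched below under assumptions (the helper names, the minimum-count threshold, and the match-by-title approach are illustrative and not part of the gist), is to read lexicon.txt back in and tag each track with the lexicon words found in its cleaned title:

import glob
import re

# Minimal sketch: load 'WORD: count' lines back into a dict, keeping frequent words only.
def load_lexicon(filename, min_count=2):
    lexicon = {}
    with open(filename, 'r') as file:
        for line in file:
            word, sep, count = line.rpartition(':')
            if sep:
                lexicon[word.strip()] = int(count)
    return {w: c for w, c in lexicon.items() if c >= min_count}

# Tag a single track: reuse the same cleaning steps, then intersect with the lexicon.
def categorise(path, lexicon):
    title = path.rsplit('.', 1)[0].rsplit('=', 1)[0]
    title = re.sub(r"[()\[\],.]", "", title).replace('_', ' ').upper()
    words = set(re.sub(r'\s+', ' ', title).strip().split())
    return sorted(words & lexicon.keys())

lexicon = load_lexicon('lexicon.txt')
for track in glob.glob(r"E:\Dubstep_diffusion\tracks\*.wav"):
    print(track, '->', categorise(track, lexicon))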