Last active
August 18, 2021 00:46
-
-
Save hikari-no-yume/7362f2b1d70e3d1250b07b09451f1c76 to your computer and use it in GitHub Desktop.
bruteforce mojibake decoder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Codec names to try at every stage of the search (list as of Python 3.9).
all_encodings = [
    'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
    'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856',
    'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865',
    'cp866', 'cp869', 'cp874', 'cp875', 'cp932', 'cp949', 'cp950', 'cp1006',
    'cp1026', 'cp1125', 'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253',
    'cp1254', 'cp1255', 'cp1256', 'cp1257', 'cp1258', 'euc_jp',
    'euc_jis_2004', 'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030',
    'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
    'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2',
    'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7',
    'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13',
    'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t',
    'koi8_u', 'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland',
    'mac_latin2', 'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis',
    'shift_jis_2004', 'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le',
    'utf_16', 'utf_16_be', 'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig',
]

# Input text that every encode -> decode -> encode -> decode chain starts from.
start = 'åÊ'
def insert(d, k, set_elem):
    """Add *set_elem* to the set stored under key *k* in dict *d*.

    The set is created on first use, so *d* ends up mapping each key to the
    set of every element that was inserted under it. *d* is mutated in
    place; nothing is returned.
    """
    d.setdefault(k, set()).add(set_elem)
# Bookkeeping for the brute-force search below.
#   stage1_encodings: stage1 bytes                 -> codecs that encoded `start` to them
#   stage2_encodings: (stage1 bytes, stage2 text)  -> codecs that decoded stage1 to stage2
#   stage3_encodings: (stage2 text, stage3 bytes)  -> codecs that encoded stage2 to stage3
#   stage4_encodings: (stage3 bytes, stage4 text) pairs where stage3 is valid
#                     UTF-8 that decodes to exactly one character
stage1_encodings = {}
stage2_encodings = {}
stage3_encodings = {}
stage4_encodings = set()

# Failures that simply mean "this codec cannot handle this data": encode and
# decode errors are UnicodeError subclasses, and a codec missing from the
# running interpreter raises LookupError.  Catching only these (instead of a
# bare `except:`) keeps Ctrl-C working during the long search.
_codec_errors = (UnicodeError, LookupError)

# Try every chain: start --encode--> stage1 --decode--> stage2 --encode-->
# stage3 --decode as UTF-8--> stage4.
for encoding1 in all_encodings:
    try:
        stage1 = start.encode(encoding1)
    except _codec_errors:
        continue
    insert(stage1_encodings, stage1, encoding1)

    for encoding2 in all_encodings:
        try:
            stage2 = stage1.decode(encoding2)
        except _codec_errors:
            continue
        insert(stage2_encodings, (stage1, stage2), encoding2)

        for encoding3 in all_encodings:
            try:
                stage3 = stage2.encode(encoding3)
            except _codec_errors:
                continue
            # Recorded before the UTF-8 check, so this map also holds
            # stage3 values that never become candidates.
            insert(stage3_encodings, (stage2, stage3), encoding3)

            try:
                stage4 = stage3.decode('utf-8')
            except _codec_errors:
                continue
            # Only results consisting of a single character are kept.
            if len(stage4) != 1:
                continue
            stage4_encodings.add((stage3, stage4))
# Report every candidate result, walking each chain backwards from the final
# UTF-8 decode to the original `start` text.  Candidates and codec names are
# sorted so the output is identical from run to run (sets of strings iterate
# in an order that varies between interpreter runs).
# NOTE(review): every indented line below starts with a single space; the
# original may have used deeper indentation per level that was lost when the
# file was extracted -- confirm against the upstream script.
for stage3, result in sorted(stage4_encodings):
    print(f"{result!r}")
    print("|- UTF-8")
    print(f" |- {stage3!r}")
    for stage2_3, encoding3 in stage3_encodings.items():
        # Keep only the encode steps that produced these stage3 bytes.
        if stage2_3[1] != stage3:
            continue
        stage2 = stage2_3[0]
        print(f" |- {', '.join(sorted(encoding3))}")
        print(f" |- {stage2!r}")
        for stage1_2, encoding2 in stage2_encodings.items():
            # Keep only the decode steps that produced this stage2 text.
            if stage1_2[1] != stage2:
                continue
            stage1 = stage1_2[0]
            print(f" |- {', '.join(sorted(encoding2))}")
            print(f" |- {stage1!r}")
            encoding1 = stage1_encodings[stage1]
            print(f" |- {', '.join(sorted(encoding1))}")
            print(f" |- {start!r}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment