Skip to content

Instantly share code, notes, and snippets.

@gullyn
Created January 17, 2021 12:48
Show Gist options
  • Save gullyn/991dbd5e74738d28a1ce6f918eaa6dab to your computer and use it in GitHub Desktop.
Save gullyn/991dbd5e74738d28a1ce6f918eaa6dab to your computer and use it in GitHub Desktop.
Wikipedia page lengths
import json
import re
from urllib.parse import unquote

import requests
def main():
    """Print every state's Wikipedia page length and the matching CSS rules.

    Reads two files from the current working directory:

    * ``states.txt`` -- one ``<wikipedia title>,<state name>`` pair per line.
    * ``state_codes.csv`` -- one ``<state name>,<code>`` pair per line; both
      fields are lower-cased before use.

    For each state the recursive page length is fetched and printed, then one
    block of ``.<code> {fill: <colour>;}`` rules is printed at the end.
    """
    # Context managers close the files promptly.  splitlines() together with
    # the blank-line filter tolerates a trailing newline and CRLF endings,
    # which previously caused IndexError / KeyError.
    with open("states.txt", "r") as states_file:
        states = [
            line.split(",")
            for line in states_file.read().splitlines()
            if line.strip()
        ]

    state_codes = {}
    with open("state_codes.csv", "r") as codes_file:
        for line in codes_file.read().splitlines():
            if not line.strip():
                continue
            fields = line.split(",")
            state_codes[fields[0].lower()] = fields[1].lower()

    # Collect the rules in a list and join once instead of repeated "+=".
    rules = []
    for state in states:
        length = page_length(state[0], True)
        print(f"{state[1].title()}: {length}")
        rules.append(
            f".{state_codes[state[1]]} {{fill: {get_color(length)};}}\n"
        )
    print("".join(rules))
def page_length(title, recursive=False):
    """Return the length of a Wikipedia page's rendered HTML.

    Args:
        title: Page title such as ``"New_York_(state)"``.  Percent-encoded
            titles (as they appear in ``/wiki/`` hrefs) are accepted too.
        recursive: When true, also add the lengths of the pages linked from
            the page's "Main article(s):" notes.  Those linked pages are
            fetched non-recursively, so the walk is one level deep.

    Returns:
        The number of characters in the parsed HTML, plus the lengths of the
        linked main articles when ``recursive`` is true.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        KeyError: When the API response carries no parse result (for example
            the page does not exist).
    """
    # Decode first so titles taken from hrefs are not double-encoded, then let
    # requests build the query string so "&", "+", "#" etc. cannot corrupt it.
    req = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "parse",
            "page": unquote(title),
            "format": "json",
            # Follow redirects so the target page is measured, not the stub.
            "redirects": 1,
        },
        timeout=30,
    )
    req.raise_for_status()
    payload = req.json()
    try:
        content = payload["parse"]["text"]["*"]
    except KeyError as err:
        raise KeyError(
            f"no parse result for {title!r}: {payload.get('error')}"
        ) from err

    sum_len = len(content)
    if not recursive:
        return sum_len
    return sum_len + sum(page_length(link) for link in get_links(content))
# A "Main article(s): " label followed by one or more /wiki/ anchors that are
# separated by spaces, commas and/or "and".  The whole anchor run is captured.
_MAIN_ARTICLES_RE = re.compile(
    r'Main articles?: ((?:<a href="/wiki/[^"]+"[^>]*>.+?</a>[\s,]*(?:and\s*)?)+)'
)
# Pulls the page title out of each anchor inside a captured run.
_WIKI_HREF_RE = re.compile(r'<a href="/wiki/([^"]+)"')


def get_links(content):
    """Return the page titles linked from "Main article(s):" notes.

    Args:
        content: HTML text of a parsed Wikipedia page.

    Returns:
        A list of ``/wiki/`` titles with any ``#fragment`` removed, in order
        of first appearance and without duplicates.
    """
    titles = []
    for anchor_run in _MAIN_ARTICLES_RE.findall(content):
        # Extract every anchor in a second pass: a repeated capture group only
        # remembers its last iteration, which used to drop links from notes
        # that list three or more articles.
        for href in _WIKI_HREF_RE.findall(anchor_run):
            page = href.split("#")[0]
            if page:
                titles.append(page)
    # dict preserves insertion order, giving an order-stable de-duplication so
    # an article referenced from several sections is only returned once.
    return list(dict.fromkeys(titles))
def get_color(length, max_length=5800883):
    """Map a page length to a CSS ``rgba()`` colour string.

    The length is scaled to a ratio in ``[0, 1]`` against ``max_length`` and
    clamped, so a length of zero or less yields white and anything at or above
    ``max_length`` yields the darkest red.

    Args:
        length: Page length to colour.
        max_length: Length at which the colour saturates.  Defaults to the
            value the script has always used.

    Returns:
        A string of the form ``"rgba(R, GB, GB, 1)"``.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be positive")
    ratio = max(min(length / max_length, 1), 0)
    # Red falls from 255 to 127 while green/blue fall from 255 to 0, giving a
    # white-to-dark-red ramp.
    red = round(255 - ratio * 128)
    green_blue = round(255 - ratio * 255)
    return f"rgba({red}, {green_blue}, {green_blue}, 1)"
# Script entry point: run the scrape only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
ALABAMA AL
ALASKA AK
ARIZONA AZ
ARKANSAS AR
CALIFORNIA CA
COLORADO CO
CONNECTICUT CT
DELAWARE DE
DISTRICT OF COLUMBIA DC
FLORIDA FL
GEORGIA GA
HAWAII HI
IDAHO ID
ILLINOIS IL
INDIANA IN
IOWA IA
KANSAS KS
KENTUCKY KY
LOUISIANA LA
MAINE ME
MARYLAND MD
MASSACHUSETTS MA
MICHIGAN MI
MINNESOTA MN
MISSISSIPPI MS
MISSOURI MO
MONTANA MT
NEBRASKA NE
NEVADA NV
NEW HAMPSHIRE NH
NEW JERSEY NJ
NEW MEXICO NM
NEW YORK NY
NORTH CAROLINA NC
NORTH DAKOTA ND
OHIO OH
OKLAHOMA OK
OREGON OR
PENNSYLVANIA PA
RHODE ISLAND RI
SOUTH CAROLINA SC
SOUTH DAKOTA SD
TENNESSEE TN
TEXAS TX
UTAH UT
VERMONT VT
VIRGINIA VA
WASHINGTON WA
WEST VIRGINIA WV
WISCONSIN WI
WYOMING WY
California,california
Texas,texas
Florida,florida
New_York_(state),new york
Pennsylvania,pennsylvania
Illinois,illinois
Ohio,ohio
Georgia_(U.S._state),georgia
North_Carolina,north carolina
Michigan,michigan
New_Jersey,new jersey
Virginia,virginia
Washington_(state),washington
Arizona,arizona
Massachusetts,massachusetts
Tennessee,tennessee
Indiana,indiana
Missouri,missouri
Maryland,maryland
Wisconsin,wisconsin
Colorado,colorado
Minnesota,minnesota
South_Carolina,south carolina
Alabama,alabama
Louisiana,louisiana
Kentucky,kentucky
Oregon,oregon
Oklahoma,oklahoma
Connecticut,connecticut
Utah,utah
Iowa,iowa
Nevada,nevada
Arkansas,arkansas
Mississippi,mississippi
Kansas,kansas
New_Mexico,new mexico
Nebraska,nebraska
Idaho,idaho
West_Virginia,west virginia
Hawaii,hawaii
New_Hampshire,new hampshire
Maine,maine
Montana,montana
Rhode_Island,rhode island
Delaware,delaware
South_Dakota,south dakota
North_Dakota,north dakota
Alaska,alaska
Vermont,vermont
Wyoming,wyoming
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment