-
-
Save copumpkin/092b0664fb34b9d072c1451c28406a94 to your computer and use it in GitHub Desktop.
Parse IAM info from https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awsservicecatalog.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from os import listdir | |
from os.path import isfile, join | |
import re | |
import json | |
from bs4 import BeautifulSoup | |
""" | |
Setup
-----
# Install libraries
pip install beautifulsoup4
# Download files
wget -r -np -k -A .html -nc https://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html
""" | |
def chomp(string):
    """Normalize the whitespace and the edges of ``string``.

    This chomp cleans up all white-space, not just at the ends:

    * every run of whitespace (spaces, tabs, carriage returns, line feeds)
      is collapsed to a single space;
    * leading and trailing non-word characters (anything that is not a
      letter, digit or underscore, e.g. whitespace and punctuation) are
      stripped.

    Args:
        string: The text to clean up.

    Returns:
        The cleaned text. An empty string is returned when the input holds
        nothing but non-word characters.
    """
    # Raw-string patterns are used throughout: a backslash class inside a
    # normal string literal is an invalid escape sequence.
    response = re.sub(r'\s+', ' ', string)  # Collapse any whitespace run to one space
    response = re.sub(r'^\W+', '', response)  # Clean start
    return re.sub(r'\W+$', '', response)  # Clean end
# Directory holding the mirrored IAM user-guide pages (see the wget command
# in the setup notes at the top of this file).
mypath = './docs.aws.amazon.com/IAM/latest/UserGuide/'

# One entry per parsed "list_*.html" service page.
schema = []


def _paragraph_texts(cell):
    """Return the cleaned text of every <p> element inside a table cell.

    An entirely empty cell yields an empty list.
    """
    if cell.text == '':
        return []
    return [chomp(paragraph.text) for paragraph in cell.find_all('p')]


def _last_link_text(cell):
    """Return the raw text of the last <a> tag carrying an href, or ''.

    Anchor-only tags (<a id='...'>) have no href and are skipped.
    """
    text = ''
    for link in cell.find_all('a'):
        if 'href' in link.attrs:
            text = link.text
    return text


def _parse_privileges(table):
    """Return the list of privilege dicts found in an "Actions" table."""
    privileges = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Header rows hold th cells only (zero td cells).  Rows with any
        # other cell count are continuation rows of a privilege that spans
        # multiple rows.
        # Example: amazonroute53-DisassociateVPCFromHostedZone
        # at https://docs.aws.amazon.com/IAM/latest/UserGuide/list_amazonroute53.html
        # TODO: Handle this situation. Currently, only the first row is used
        if len(cells) != 6:
            continue

        priv = chomp(_last_link_text(cells[0]))
        if priv == '':
            continue

        privileges.append({
            'privilege': priv,
            'description': chomp(cells[1].text),
            'access_level': chomp(cells[2].text),
            'resource_types': chomp(cells[3].text),
            'condition_keys': _paragraph_texts(cells[4]),
            'dependent_actions': _paragraph_texts(cells[5])
        })
    return privileges


def _parse_resources(table):
    """Return a {resource name: schema} dict from a "Resource Types" table."""
    resources = {}
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # Skip the header row (no td cells) and any malformed row that lacks
        # the name / ARN pattern / condition key columns read below.
        if len(cells) < 3:
            continue

        name = _last_link_text(cells[0]).strip()
        if name == '':
            continue

        resources[name] = {
            'pattern': cells[1].text.strip(),
            'condition_keys': _paragraph_texts(cells[2])
        }
    return resources


# Sorted so that the emitted JSON has a stable order; listdir() returns
# entries in arbitrary order.
for filename in sorted(listdir(mypath)):
    if not filename.startswith("list_"):
        continue
    filepath = join(mypath, filename)
    if not isfile(filepath):
        continue

    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # and release the file handle before the parsing work starts.
    with open(filepath, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    main_content = soup.find(id="main-content")
    if main_content is None:
        continue

    # Get service name: drop the fixed heading prefix and the closing tag.
    title = main_content.find('h1', class_="topictitle")
    title = re.sub('.*Actions, Resources, and Condition Keys for *', '', str(title))
    title = title.replace('</h1>', '')
    service_name = chomp(title)

    service_schema = {'service_name': service_name, 'privileges': [], 'resources': {}}

    for table in main_content.find_all('div', class_="table-contents"):
        # There can be 3 tables, the actions table, an ARN table, and a condition key table
        # Example: https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssecuritytokenservice.html
        headers = [str(th) for th in table.find_all('th')]
        if '<th>Actions</th>' in headers:
            service_schema['privileges'].extend(_parse_privileges(table))
        elif '<th>Resource Types</th>' in headers:
            service_schema['resources'].update(_parse_resources(table))

    schema.append(service_schema)

print(json.dumps(schema))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment