Skip to content

Instantly share code, notes, and snippets.

@copumpkin
Forked from 0xdabbad00/extract.py
Last active May 2, 2022 11:15
Show Gist options
  • Save copumpkin/092b0664fb34b9d072c1451c28406a94 to your computer and use it in GitHub Desktop.
Save copumpkin/092b0664fb34b9d072c1451c28406a94 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from os import listdir
from os.path import isfile, join
import re
import json
from bs4 import BeautifulSoup
"""
Setup
-----
# Install libraries
pip install beautifulsoup4
# Download files
wget -r -np -k -A .html -nc https://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html
"""
def chomp(string):
    """Collapse whitespace inside a string and trim its ends.

    Unlike a classic "chomp", this cleans up white-space everywhere in the
    string, not just at the ends:

    * every run of whitespace (spaces, tabs, line ends) becomes one space;
    * all leading and trailing non-word characters are removed.

    NOTE: the end-trimming removes any non-word character, not only
    whitespace, so surrounding punctuation such as brackets or a closing
    brace is dropped as well.

    :param string: the text to clean.
    :return: the cleaned text ('' for empty or punctuation-only input).
    """
    # Patterns are raw strings: '\W' and '\s' in a normal string literal are
    # invalid escape sequences and trigger warnings on modern Python 3.
    response = re.sub(r'\s+', ' ', string)  # Collapse all whitespace runs
    response = re.sub(r'^\W*', '', response)  # Clean start
    return re.sub(r'\W*$', '', response)  # Clean end
# Directory holding the mirrored IAM User Guide pages (see the wget command
# in the setup notes at the top of this file).
mypath = './docs.aws.amazon.com/IAM/latest/UserGuide/'
# One entry per "list_*" service page; dumped as JSON at the end.
schema = []


def _linked_text(cell):
    """Return the raw text of the last <a href=...> in a cell, or ''."""
    text = ''
    for link in cell.find_all('a'):
        # Skip the <a id='...'> tags, which carry no href
        if 'href' in link.attrs:
            text = link.text
    return text


def _paragraph_texts(cell):
    """Return the cleaned text of every <p> in a cell ([] if the cell is empty)."""
    if cell.text == '':
        return []
    return [chomp(paragraph.text) for paragraph in cell.find_all('p')]


def _parse_privileges(table):
    """Return one privilege dict per complete (6-cell) row of an actions table."""
    privileges = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 6:
            # Header rows have th cells instead of td cells, so they show
            # up here with zero cells and are skipped.
            # Sometimes the privilege might span multiple rows.
            # Example: amazonroute53-DisassociateVPCFromHostedZone
            # at https://docs.aws.amazon.com/IAM/latest/UserGuide/list_amazonroute53.html
            # TODO: Handle this situation. Currently, I only use the first row
            continue
        priv = chomp(_linked_text(cells[0]))
        if priv == '':
            continue
        privileges.append({
            'privilege': priv,
            'description': chomp(cells[1].text),
            'access_level': chomp(cells[2].text),
            'resource_types': chomp(cells[3].text),
            'condition_keys': _paragraph_texts(cells[4]),
            'dependent_actions': _paragraph_texts(cells[5])
        })
    return privileges


def _parse_resources(table):
    """Return {resource name: {'pattern', 'condition_keys'}} for a resource table."""
    resources = {}
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 3:
            # Header rows have no td cells; a shorter-than-expected row
            # would otherwise raise IndexError on cells[1] / cells[2].
            continue
        name = _linked_text(cells[0]).strip()
        if name == '':
            continue
        resources[name] = {
            'pattern': cells[1].text.strip(),
            'condition_keys': _paragraph_texts(cells[2])
        }
    return resources


# Sorted so the generated JSON is stable from run to run (listdir order is
# arbitrary).
for filename in sorted(f for f in listdir(mypath) if isfile(join(mypath, f))):
    if not filename.startswith("list_"):
        continue
    # Read bytes and let BeautifulSoup work out the document encoding rather
    # than decoding with whatever the platform default encoding happens to be.
    with open(join(mypath, filename), 'rb') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    main_content = soup.find(id="main-content")
    if main_content is None:
        continue
    # Get service name
    title = main_content.find('h1', class_="topictitle")
    title = re.sub('.*Actions, Resources, and Condition Keys for *', '', str(title))
    title = title.replace('</h1>', '')
    service_name = chomp(title)
    service_schema = {'service_name': service_name, 'privileges': [], 'resources': {}}
    for table in main_content.find_all('div', class_="table-contents"):
        # There can be 3 tables, the actions table, an ARN table, and a condition key table
        # Example: https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssecuritytokenservice.html
        headers = [str(x) for x in table.find_all('th')]
        if '<th>Actions</th>' in headers:
            service_schema['privileges'].extend(_parse_privileges(table))
        elif '<th>Resource Types</th>' in headers:
            service_schema['resources'].update(_parse_resources(table))
    schema.append(service_schema)
print(json.dumps(schema))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment