copumpkin/extract.py

## extract.py
#!/usr/bin/env python

from os import listdir
from os.path import isfile, join
import re
import json

from bs4 import BeautifulSoup

"""
Setup
-----

# Install libraries
pip install beautifulsoup4

# Download files
wget -r -np -k -A .html -nc https://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html
"""

def chomp(string):
    """Normalise the text of an HTML fragment into a single clean line.

    Unlike a classic "chomp", this cleans whitespace throughout the string,
    not just at the ends:

    * every newline becomes a space,
    * every run of spaces is collapsed to a single space,
    * all leading and trailing non-word characters (``\\W``: whitespace and
      punctuation alike) are removed.

    Note that only newlines and spaces are collapsed in the interior of the
    string; other whitespace such as tabs is left untouched there.

    :param string: the text to clean.
    :return: the cleaned text ('' if nothing but non-word characters remain).
    """
    # Raw strings are used for the patterns: '\W' in a plain string literal
    # is an invalid escape sequence and triggers a warning on modern Python.
    response = string.replace('\n', ' ')  # Convert line ends to spaces
    response = re.sub(r' [ ]*', ' ', response)  # Collapse runs of spaces
    response = re.sub(r'^[\W]*', '', response)  # Clean start
    return re.sub(r'[\W]*$', '', response)  # Clean end

def _paragraph_texts(cell):
    """Return the chomp()-cleaned text of each <p> element inside a table cell.

    A cell whose text is empty yields an empty list.
    """
    if cell.text == '':
        return []
    return [chomp(paragraph.text) for paragraph in cell.find_all('p')]


def _last_link_text(cell):
    """Return the raw text of the last <a href=...> in a cell, or '' if none.

    Anchors without an href (the <a id='...'> targets) are ignored.
    """
    text = ''
    for link in cell.find_all('a'):
        if 'href' in link.attrs:
            text = link.text
    return text


# Directory holding the mirrored IAM User Guide pages (see the wget command
# in the setup notes at the top of the file).
mypath = './docs.aws.amazon.com/IAM/latest/UserGuide/'

# One entry per service page:
#   {'service_name': str, 'privileges': [dict, ...], 'resources': {name: dict}}
schema = []

# Sorted so the emitted JSON has a stable order; listdir() order is arbitrary.
for filename in sorted(f for f in listdir(mypath) if isfile(join(mypath, f))):
    # Only the "list_*.html" pages are processed.
    if not filename.startswith("list_"):
        continue

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default (the mirrored pages are presumably UTF-8 -- confirm if decoding
    # errors show up).
    with open(join(mypath, filename), 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    main_content = soup.find(id="main-content")
    if main_content is None:
        continue

    # Get service name: strip everything up to and including the fixed title
    # prefix from the serialised <h1>, then drop the closing tag.
    title = main_content.find('h1', class_="topictitle")
    title = re.sub('.*Actions, Resources, and Condition Keys for *', '', str(title))
    title = title.replace('</h1>', '')
    service_name = chomp(title)

    service_schema = {'service_name': service_name, 'privileges': [], 'resources': {}}

    for table in main_content.find_all('div', class_="table-contents"):
        # There can be 3 tables, the actions table, an ARN table, and a condition key table
        # Example: https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssecuritytokenservice.html
        # The table kind is recognised by its header cells (computed once).
        headers = [str(th) for th in table.find_all('th')]

        if '<th>Actions</th>' in headers:
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if len(cells) != 6:
                    # Skips the header row (th cells only, so zero td cells)
                    # as well as rows of a privilege that spans multiple rows.
                    # Example: amazonroute53-DisassociateVPCFromHostedZone
                    # at https://docs.aws.amazon.com/IAM/latest/UserGuide/list_amazonroute53.html
                    # TODO: Handle this situation. Currently, only the first row is used
                    continue

                # Get the privilege
                priv = chomp(_last_link_text(cells[0]))
                if priv == '':
                    continue

                service_schema['privileges'].append({
                    'privilege': priv,
                    'description': chomp(cells[1].text),
                    'access_level': chomp(cells[2].text),
                    'resource_types': chomp(cells[3].text),
                    'condition_keys': _paragraph_texts(cells[4]),
                    'dependent_actions': _paragraph_texts(cells[5]),
                })
        elif '<th>Resource Types</th>' in headers:
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if len(cells) < 3:
                    # Header rows have no td cells; rows with fewer than the
                    # three cells read below (name, pattern, condition keys)
                    # are skipped rather than raising IndexError.
                    continue

                name = _last_link_text(cells[0]).strip()
                if name == '':
                    continue

                service_schema['resources'][name] = {
                    'pattern': cells[1].text.strip(),
                    'condition_keys': _paragraph_texts(cells[2]),
                }

    schema.append(service_schema)

print(json.dumps(schema))
	#!/usr/bin/env python

	from os import listdir
	from os.path import isfile, join
	import re
	import json

	from bs4 import BeautifulSoup

	"""
	Setup
	-----

	# Install libraries
	pip install beautifulsoup4

	# Download files
	wget -r -np -k -A .html -nc https://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html
	"""

	def chomp(string):
	    """Normalise the text of an HTML fragment into a single clean line.

	    Unlike a classic "chomp", this cleans whitespace throughout the string,
	    not just at the ends:

	    * every newline becomes a space,
	    * every run of spaces is collapsed to a single space,
	    * all leading and trailing non-word characters (``\\W``: whitespace and
	      punctuation alike) are removed.

	    Note that only newlines and spaces are collapsed in the interior of the
	    string; other whitespace such as tabs is left untouched there.

	    :param string: the text to clean.
	    :return: the cleaned text ('' if nothing but non-word characters remain).
	    """
	    # Raw strings are used for the patterns: '\W' in a plain string literal
	    # is an invalid escape sequence and triggers a warning on modern Python.
	    response = string.replace('\n', ' ')  # Convert line ends to spaces
	    response = re.sub(r' [ ]*', ' ', response)  # Collapse runs of spaces
	    response = re.sub(r'^[\W]*', '', response)  # Clean start
	    return re.sub(r'[\W]*$', '', response)  # Clean end

	def _paragraph_texts(cell):
	    """Return the chomp()-cleaned text of each <p> element inside a table cell.

	    A cell whose text is empty yields an empty list.
	    """
	    if cell.text == '':
	        return []
	    return [chomp(paragraph.text) for paragraph in cell.find_all('p')]


	def _last_link_text(cell):
	    """Return the raw text of the last <a href=...> in a cell, or '' if none.

	    Anchors without an href (the <a id='...'> targets) are ignored.
	    """
	    text = ''
	    for link in cell.find_all('a'):
	        if 'href' in link.attrs:
	            text = link.text
	    return text


	# Directory holding the mirrored IAM User Guide pages (see the wget command
	# in the setup notes at the top of the script).
	mypath = './docs.aws.amazon.com/IAM/latest/UserGuide/'

	# One entry per service page:
	#   {'service_name': str, 'privileges': [dict, ...], 'resources': {name: dict}}
	schema = []

	# Sorted so the emitted JSON has a stable order; listdir() order is arbitrary.
	for filename in sorted(f for f in listdir(mypath) if isfile(join(mypath, f))):
	    # Only the "list_*.html" pages are processed.
	    if not filename.startswith("list_"):
	        continue

	    # Decode explicitly as UTF-8 instead of relying on the platform's locale
	    # default (the mirrored pages are presumably UTF-8 -- confirm if decoding
	    # errors show up).
	    with open(join(mypath, filename), 'r', encoding='utf-8') as f:
	        soup = BeautifulSoup(f.read(), 'html.parser')

	    main_content = soup.find(id="main-content")
	    if main_content is None:
	        continue

	    # Get service name: strip everything up to and including the fixed title
	    # prefix from the serialised <h1>, then drop the closing tag.
	    title = main_content.find('h1', class_="topictitle")
	    title = re.sub('.*Actions, Resources, and Condition Keys for *', '', str(title))
	    title = title.replace('</h1>', '')
	    service_name = chomp(title)

	    service_schema = {'service_name': service_name, 'privileges': [], 'resources': {}}

	    for table in main_content.find_all('div', class_="table-contents"):
	        # There can be 3 tables, the actions table, an ARN table, and a condition key table
	        # Example: https://docs.aws.amazon.com/IAM/latest/UserGuide/list_awssecuritytokenservice.html
	        # The table kind is recognised by its header cells (computed once).
	        headers = [str(th) for th in table.find_all('th')]

	        if '<th>Actions</th>' in headers:
	            for row in table.find_all('tr'):
	                cells = row.find_all('td')
	                if len(cells) != 6:
	                    # Skips the header row (th cells only, so zero td cells)
	                    # as well as rows of a privilege that spans multiple rows.
	                    # Example: amazonroute53-DisassociateVPCFromHostedZone
	                    # at https://docs.aws.amazon.com/IAM/latest/UserGuide/list_amazonroute53.html
	                    # TODO: Handle this situation. Currently, only the first row is used
	                    continue

	                # Get the privilege
	                priv = chomp(_last_link_text(cells[0]))
	                if priv == '':
	                    continue

	                service_schema['privileges'].append({
	                    'privilege': priv,
	                    'description': chomp(cells[1].text),
	                    'access_level': chomp(cells[2].text),
	                    'resource_types': chomp(cells[3].text),
	                    'condition_keys': _paragraph_texts(cells[4]),
	                    'dependent_actions': _paragraph_texts(cells[5]),
	                })
	        elif '<th>Resource Types</th>' in headers:
	            for row in table.find_all('tr'):
	                cells = row.find_all('td')
	                if len(cells) < 3:
	                    # Header rows have no td cells; rows with fewer than the
	                    # three cells read below (name, pattern, condition keys)
	                    # are skipped rather than raising IndexError.
	                    continue

	                name = _last_link_text(cells[0]).strip()
	                if name == '':
	                    continue

	                service_schema['resources'][name] = {
	                    'pattern': cells[1].text.strip(),
	                    'condition_keys': _paragraph_texts(cells[2]),
	                }

	    schema.append(service_schema)

	print(json.dumps(schema))