BaksiLi/eodict.py

## eodict.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# version: 0.1

import requests
from bs4 import BeautifulSoup
import pandas as pd

def download_from_web(url: str, timeout: float = 30.0) -> str:
    '''
    Download a dictionary web page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body, decoded with the encoding guessed from its content.

    Raises:
        requests.HTTPError: If the server answers with a status other than 200.
        requests.RequestException: On connection problems or a timeout.
    '''
    # Without a timeout, requests may block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        # Fail loudly instead of silently returning None, which would break
        # the declared ``-> str`` contract.
        raise requests.HTTPError(
            'Unexpected status {} for {}'.format(response.status_code, url),
            response=response)

    # Prefer the encoding detected from the content over the declared one.
    response.encoding = response.apparent_encoding
    return response.text

def _parse_eo_en(text: str, alphabet) -> dict:
    '''
    Build {vorto: defino} from the plain-text Esperanto-English word list.
    '''
    entries = {}
    # Only this fixed line range of the page is scanned
    # (presumably where the word list sits -- confirm against the page).
    for line in text.split('\n')[39: 15411]:
        # Skip blank lines (line[0] would raise IndexError) and any line
        # whose first character is not an Esperanto letter or '-'.
        if not line or line[0] not in alphabet:
            continue
        # First token is the headword, the remainder is the definition.
        vorto, *rest = line.split()
        entries[vorto] = ' '.join(rest)
    return entries


def _parse_eo_jp(html: str) -> dict:
    '''
    Build {vorto: defino} from the <table> of the Esperanto-Japanese page.
    '''
    soup = BeautifulSoup(html, 'html.parser')
    # the second table is the entries
    rows = soup.find_all('table')[1].find_all('tr')

    entries = {}
    # The first and last rows are skipped (presumably header/footer -- confirm).
    for row in rows[1:-1]:
        cells = row.get_text().split()
        if not cells:
            # A row without any text has no headword to store.
            continue
        # Headword with its '/' separators removed; last token is the gloss.
        entries[cells[0].replace('/', '')] = cells[-1]
    return entries


def _parse_etymology(html: str) -> dict:
    '''
    Build {vorto: defino} from the <p class="MsoNormal"> paragraphs.
    '''
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = soup.find_all('p', attrs={'class': 'MsoNormal'})

    entries = {}
    # The first five paragraphs are skipped (presumably front matter -- confirm).
    for paragraph in paragraphs[5:]:
        # First text line is the headword; the remaining lines are glued
        # together (without separator) as the etymology.
        vorto, *rest = paragraph.get_text().split('\n')
        entries[vorto] = ''.join(rest)
    return entries


def retrieve_entries() -> (dict, dict, dict):
    '''
    Retrieve dictionaries from their links, and output as a tuple of three dicts.

    Returns:
        (Esperanto-English, Esperanto-Japanese, etymology) dicts, each mapping
        a headword to its definition string. When a headword occurs more than
        once in a source, the last occurrence wins.
    '''
    webpages = {
        'Esperanto-English Dictionary':
            'https://www.esperanto-panorama.net/vortaro/eo-en-u.htm',
        'エスペラント日本語基本辞書':
            'https://vastalto.com/kagi/zisyo.html#LINIO_CX',
        'Etymological Dictionary of the Esperanto Language':  # Etimologia Vortareto
            'https://web.archive.org/web/20100106014600/http://etymological.freeweb.hu/Esperanto.htm'
    }
    # Characters allowed to start an entry line: the 28 Esperanto letters in
    # both cases plus '-'. The uppercase run previously lacked 'R'.
    # A set gives O(1) membership tests.
    eo_alphabet = set('abcĉdefgĝhĥijĵklmnoprsŝtuŭvz-'
                      'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ')

    # data stored as plain texts
    dict1_entries = _parse_eo_en(
        download_from_web(url=webpages['Esperanto-English Dictionary']),
        eo_alphabet)

    # data stored in <table>
    dict2_entries = _parse_eo_jp(
        download_from_web(url=webpages['エスペラント日本語基本辞書']))

    # data stored in <p>
    dict3_entries = _parse_etymology(
        download_from_web(
            url=webpages['Etymological Dictionary of the Esperanto Language']))

    return (dict1_entries, dict2_entries, dict3_entries)


if __name__ == '__main__':
    # Script entry point: fetch the three dictionaries and dump them side by
    # side into ./dict.csv.
    eo_en, eo_jp, etymology = retrieve_entries()

    # One single-column frame per dictionary, indexed by headword.
    labelled = (
        (eo_en, 'Eo-E'),
        (eo_jp, 'Eo-Jp'),
        (etymology, 'Etymology'),
    )
    frames = [
        pd.DataFrame.from_dict(entries, orient='index', columns=[label])
        for entries, label in labelled
    ]

    # TODO: combine using lowercases
    # Left joins on the index: only headwords of the first (Esperanto-English)
    # frame are kept; missing translations become empty cells.
    pd_combined, *others = frames
    for other in others:
        pd_combined = pd_combined.join(other, how='left')

    # Headword index is written as the first column; no header row.
    pd_combined.to_csv('./dict.csv', index=True, header=False)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	# version: 0.1

	import requests
	from bs4 import BeautifulSoup
	import pandas as pd

	def download_from_web(url: str, timeout: float = 30.0) -> str:
	    '''
	    Download a dictionary web page and return its decoded text.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The response body, decoded with the encoding guessed from its content.

	    Raises:
	        requests.HTTPError: If the server answers with a status other than 200.
	        requests.RequestException: On connection problems or a timeout.
	    '''
	    # Without a timeout, requests may block forever on an unresponsive server.
	    response = requests.get(url, timeout=timeout)

	    if response.status_code != 200:
	        # Fail loudly instead of silently returning None, which would break
	        # the declared ``-> str`` contract.
	        raise requests.HTTPError(
	            'Unexpected status {} for {}'.format(response.status_code, url),
	            response=response)

	    # Prefer the encoding detected from the content over the declared one.
	    response.encoding = response.apparent_encoding
	    return response.text

	def retrieve_entries() -> (dict, dict, dict):
	    '''
	    Retrieve dictionaries from their links, and output as a tuple of three dicts.

	    Returns:
	        (Esperanto-English, Esperanto-Japanese, etymology) dicts, each mapping
	        a headword to its definition string. When a headword occurs more than
	        once in a source, the last occurrence wins.
	    '''
	    webpages = {
	        'Esperanto-English Dictionary':
	            'https://www.esperanto-panorama.net/vortaro/eo-en-u.htm',
	        'エスペラント日本語基本辞書':
	            'https://vastalto.com/kagi/zisyo.html#LINIO_CX',
	        'Etymological Dictionary of the Esperanto Language':  # Etimologia Vortareto
	            'https://web.archive.org/web/20100106014600/http://etymological.freeweb.hu/Esperanto.htm'
	    }
	    # Characters allowed to start an entry line: the 28 Esperanto letters in
	    # both cases plus '-'. The uppercase run previously lacked 'R'.
	    # A set gives O(1) membership tests.
	    eo_alphabet = set('abcĉdefgĝhĥijĵklmnoprsŝtuŭvz-'
	                      'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ')

	    dict1_entries, dict2_entries, dict3_entries = {}, {}, {}

	    # for 'Esperanto-English Dictionary'
	    # data stored as plain texts
	    dict1_name = 'Esperanto-English Dictionary'
	    dict1_lines = download_from_web(
	        url=webpages[dict1_name]).split('\n')

	    # Only this fixed line range of the page is scanned
	    # (presumably where the word list sits -- confirm against the page).
	    for line in dict1_lines[39: 15411]:
	        # Skip blank lines (line[0] would raise IndexError) and any line
	        # whose first character is not an Esperanto letter or '-'.
	        if not line or line[0] not in eo_alphabet:
	            continue
	        # First token is the headword, the remainder is the definition.
	        line_contents = line.split()
	        vorto = line_contents[0]
	        defino = ' '.join(line_contents[1:])
	        dict1_entries[vorto] = defino

	    # for 'エスペラント日本語基本辞書'
	    # data stored in <table>
	    dict2_name = 'エスペラント日本語基本辞書'
	    soup2 = BeautifulSoup(download_from_web(url=webpages[dict2_name]), 'html.parser')
	    table = soup2.find_all('table')[1].find_all('tr')  # the second table is the entries

	    # The first and last rows are skipped (presumably header/footer -- confirm).
	    for i in table[1:-1]:
	        line_contents = i.get_text().split()
	        if not line_contents:
	            # A row without any text has no headword to store.
	            continue
	        # Headword with its '/' separators removed; last token is the gloss.
	        vorto = line_contents[0].replace('/', '')
	        defino = line_contents[-1]
	        dict2_entries[vorto] = defino

	    # for 'Etymological Dictionary of the Esperanto Language'
	    # data stored in <p>
	    dict3_name = 'Etymological Dictionary of the Esperanto Language'
	    soup3 = BeautifulSoup(download_from_web(url=webpages[dict3_name]), 'html.parser')

	    table = soup3.find_all('p', attrs={'class': 'MsoNormal'})
	    # The first five paragraphs are skipped (presumably front matter -- confirm).
	    for i in table[5:]:
	        # First text line is the headword; the remaining lines are glued
	        # together (without separator) as the etymology.
	        line_contents = i.get_text().split('\n')
	        vorto = line_contents[0]
	        defino = ''.join(line_contents[1:])
	        dict3_entries[vorto] = defino

	    return (dict1_entries, dict2_entries, dict3_entries)



	if __name__ == '__main__':
	    # Script entry point: fetch the three dictionaries and dump them side by
	    # side into ./dict.csv.
	    (d1, d2, d3) = retrieve_entries()

	    # One single-column frame per dictionary, indexed by headword.
	    pd1 = pd.DataFrame.from_dict(d1, orient='index', columns=['Eo-E'])
	    pd2 = pd.DataFrame.from_dict(d2, orient='index', columns=['Eo-Jp'])
	    pd3 = pd.DataFrame.from_dict(d3, orient='index', columns=['Etymology'])

	    # TODO: combine using lowercases
	    # Left joins on the index: only headwords of the Esperanto-English frame
	    # are kept; missing translations become empty cells.
	    pd_combined = pd1.join(pd2, how='left').join(pd3, how='left')

	    # Headword index is written as the first column; no header row.
	    pd_combined.to_csv('./dict.csv', index=True, header=False)