# Source: GitHub gist by @nikopartanen (gist id 40fdedfb4a3c1a0c01c130ca821da5e3),
# last active July 27, 2020.
import xml.etree.cElementTree as ET
from PIL import Image
import numpy as np
from google.protobuf.json_format import MessageToJson
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pathlib import Path
import sklearn as sk
from skimage import filters
from mikatools import *
# Set the page image and its ALTO XML file here.
# output_dir is the directory the cropped line images are written to.
# Image ids are currently not unique when there are many images from different files.
# Original note: "line 106" (a line number in the author's original source) should
# also include line['left'] etc. in the id so that the ids become unique.
# NOTE(review): load_image and read_alto are defined further down in this file, so
# unless `from mikatools import *` happens to provide them, these calls run before
# the definitions exist -- confirm the intended statement order.
# NOTE(review): the trailing comment says "version = 3", but read_alto is called
# with its default version (2) -- confirm which ALTO namespace the file uses.
image = load_image("brigadir/Brigadir_koi_1932_03_17_0001.jpg")
alto = read_alto("brigadir/Brigadir_koi_1932_03_17_0001.xml") # version = 3
output_dir = "uusi_esimerkki"
def load_image(path):
    """Open the image file at ``path`` with PIL and return the image object."""
    return Image.open(path)
def extract_line_array(pil_image, height, width, top, left):
    """Crop one text line out of a page image and binarize it.

    Args:
        pil_image: PIL image of the whole page.
        height, width, top, left: Bounding box of the line. Each value may be
            a number or a numeric string such as ``"120"`` or ``"120.5"``
            (the caller passes raw XML attribute strings); fractional parts
            are truncated.

    Returns:
        A 2-D numpy array of the cropped, grayscale line in which every pixel
        is either 0 or 255, split at the Otsu threshold of the crop.

    Raises:
        ValueError: If a coordinate cannot be parsed as a number.
    """
    # Go through float() first: int("120.5") raises ValueError, and ALTO
    # coordinate attributes are allowed to carry decimals.
    x0 = int(float(left))
    y0 = int(float(top))
    x1 = x0 + int(float(width))
    y1 = y0 + int(float(height))

    grayscale_crop = pil_image.crop((x0, y0, x1, y1)).convert("L")
    pixels = np.array(grayscale_crop)
    otsu_threshold = filters.threshold_otsu(pixels)
    return binarize_array(pixels, otsu_threshold)
def binarize_array(numpy_array, threshold):
    """Binarize a numpy array in place.

    Every element strictly greater than ``threshold`` becomes 255 and every
    other element becomes 0.

    Args:
        numpy_array: Numpy array of pixel values. It is modified in place.
        threshold: Cut-off value; elements equal to it map to 0.

    Returns:
        The same array object that was passed in, after modification.
    """
    # One vectorized pass instead of a per-pixel Python loop. Writing through
    # [...] keeps the in-place contract (same object, same dtype) and also
    # works for arrays with zero rows, where indexing row 0 used to fail.
    numpy_array[...] = np.where(numpy_array > threshold, 255, 0)
    return numpy_array
def read_alto(alto_file, version=2):
    """Read the text lines of an ALTO XML file into a list of dicts.

    Args:
        alto_file: Path (or file object) of the ALTO XML document, passed to
            ``ET.parse``.
        version: ALTO major version; selects the XML namespace
            ``http://www.loc.gov/standards/alto/ns-v<version>#``.

    Returns:
        A list with one dict per ``TextLine`` found inside a ``TextBlock``.
        Each dict has the keys ``block_id``, ``block_height``,
        ``block_width``, ``block_top``, ``block_left`` (attributes of the
        enclosing block), ``height``, ``width``, ``top``, ``left``
        (attributes of the line), ``unit`` (text of ``MeasurementUnit``),
        ``max_height``, ``max_width`` (size of ``PrintSpace``),
        ``file_path`` (``str(alto_file)``) and ``text`` (the ``CONTENT`` of
        the line's direct ``String`` children joined by single spaces).
        Attribute values are the raw XML strings; anything missing from the
        document is ``None``.
    """
    ns = f"{{http://www.loc.gov/standards/alto/ns-v{version}#}}"
    root = ET.parse(alto_file).getroot()

    # Both elements can be absent from a document; report None instead of
    # failing with AttributeError on the result of find().
    unit_element = root.find(f".//{ns}MeasurementUnit")
    unit = unit_element.text if unit_element is not None else None

    print_space = root.find(f".//{ns}PrintSpace")
    if print_space is None:
        max_height = max_width = None
    else:
        max_height = print_space.get('HEIGHT')
        max_width = print_space.get('WIDTH')

    file_path = str(alto_file)
    data = []
    for block in root.iterfind(f".//{ns}TextBlock"):
        # Block-level fields are identical for every line of the block.
        block_fields = {
            "block_id": block.get('ID'),
            "block_height": block.get('HEIGHT'),
            "block_width": block.get('WIDTH'),
            "block_top": block.get('VPOS'),
            "block_left": block.get('HPOS'),
        }
        for line in block.iterfind(f".//{ns}TextLine"):
            words = (s.get('CONTENT') for s in line.findall(f"./{ns}String"))
            content = dict(block_fields)
            content.update({
                "height": line.get('HEIGHT'),
                "width": line.get('WIDTH'),
                "top": line.get('VPOS'),
                "left": line.get('HPOS'),
                "unit": unit,
                "max_height": max_height,
                "max_width": max_width,
                "file_path": file_path,
                # A String without CONTENT would make join() raise TypeError.
                "text": ' '.join(w for w in words if w is not None),
            })
            data.append(content)
    return data
# Crop every ALTO text line out of the page image and save it as a binarized PNG.
# Image.save does not create missing directories, so make sure the target exists.
Path(output_dir).mkdir(parents=True, exist_ok=True)
for line in alto:
    print(line)
    line_array = extract_line_array(image, line['height'], line['width'], line['top'], line['left'])
    # The file name combines the block id with the line's geometry.
    output_path = f"{output_dir}/{line['block_id']}_{line['height']}_{line['width']}_{line['top']}_{line['left']}.png"
    Image.fromarray(line_array).save(output_path)
    print(f"Saved {output_path}")
# (GitHub gist page footer omitted.)