Last active
July 27, 2020 18:31
-
-
Save nikopartanen/40fdedfb4a3c1a0c01c130ca821da5e3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.cElementTree as ET | |
from PIL import Image | |
import numpy as np | |
from google.protobuf.json_format import MessageToJson | |
import matplotlib.pyplot as plt | |
import matplotlib.patches as patches | |
from pathlib import Path | |
import sklearn as sk | |
from skimage import filters | |
from mikatools import * | |
# --- Script configuration ---------------------------------------------------
# PUT THE PAGE IMAGE AND ITS ALTO XML FILE HERE.
# output_dir is the directory the cropped line images are written to.
# The image ids are currently not unique when there are many images coming
# from different files.
# Line 106 (of the author's original script) should be extended so that it
# also uses line['left'] etc., so that the ids become unique.
# NOTE(review): load_image and read_alto are defined further down in this
# file, so the two calls below only work once those definitions have already
# been executed (e.g. in an interactive session) -- confirm the intended
# execution order.
image = load_image("brigadir/Brigadir_koi_1932_03_17_0001.jpg")
alto = read_alto("brigadir/Brigadir_koi_1932_03_17_0001.xml")  # version = 3
output_dir = "uusi_esimerkki"
def load_image(path):
    """Open the image file at ``path`` and return the resulting image object.

    Args:
        path: Location of the image file; passed unchanged to ``Image.open``.

    Returns:
        Whatever ``Image.open(path)`` returns.
    """
    return Image.open(path)
def extract_line_array(pil_image, height, width, top, left):
    """Crop one rectangle out of an image and return it as a binarized array.

    The rectangle is cropped from ``pil_image``, converted to 8-bit grayscale
    (mode ``"L"``), turned into a numpy array and thresholded with the Otsu
    threshold computed by ``filters.threshold_otsu``.

    Args:
        pil_image: Image object providing ``crop`` and ``convert``.
        height: Height of the rectangle (number or numeric string).
        width: Width of the rectangle (number or numeric string).
        top: Vertical offset of the rectangle's upper edge.
        left: Horizontal offset of the rectangle's left edge.

    Returns:
        The array produced by ``binarize_array`` for the cropped region.
    """
    # The geometry arrives as XML attribute strings (see read_alto). Go
    # through float() first so decimal values such as "12.0" or "12.7" are
    # truncated to whole pixels instead of making int() raise ValueError.
    x_start = int(float(left))
    y_start = int(float(top))
    x_end = x_start + int(float(width))
    y_end = y_start + int(float(height))

    grayscale_crop = pil_image.crop((x_start, y_start, x_end, y_end)).convert("L")
    pixels = np.array(grayscale_crop)
    otsu_threshold = filters.threshold_otsu(pixels)
    return binarize_array(pixels, otsu_threshold)
def binarize_array(numpy_array, threshold):
    """Binarize a numpy array in place.

    Every element strictly greater than ``threshold`` becomes 255 and every
    other element becomes 0.

    Args:
        numpy_array: Writable ``numpy.ndarray``; it is modified in place.
        threshold: Value the elements are compared against.

    Returns:
        The same ``numpy_array`` object that was passed in.
    """
    # One vectorized comparison replaces the per-pixel Python loops; it also
    # handles arrays with zero rows, which the index-based loop could not.
    above = numpy_array > threshold
    numpy_array[above] = 255
    numpy_array[~above] = 0
    return numpy_array
def read_alto(alto_file, version=2):
    """Read the text lines of an ALTO XML file into a list of dicts.

    Args:
        alto_file: Path or file object accepted by ``ET.parse``.
        version: ALTO namespace version; it is inserted into the namespace
            URI ``http://www.loc.gov/standards/alto/ns-v<version>#`` that is
            used for every element lookup.

    Returns:
        One dict per ``TextLine`` element, in document order, with the keys
        ``block_id``, ``block_height``, ``block_width``, ``block_top``,
        ``block_left`` (attributes of the enclosing ``TextBlock``),
        ``height``, ``width``, ``top``, ``left`` (attributes of the line),
        ``unit``, ``max_height``, ``max_width`` (page-level values),
        ``file_path`` and ``text``. Attribute values are returned as the raw
        strings found in the XML (``None`` when an attribute is absent).

    Raises:
        AttributeError: If no ``MeasurementUnit`` or ``PrintSpace`` element
            exists under the namespace for ``version`` (for example when the
            file uses a different ALTO version).
    """
    ns = '{http://www.loc.gov/standards/alto/ns-v' + str(version) + '#}'
    root = ET.parse(alto_file).getroot()

    # Page-level values are identical for every line, so look them up once.
    unit = root.find('.//' + ns + 'MeasurementUnit').text
    print_space = root.find('.//' + ns + 'PrintSpace')
    max_height = print_space.get('HEIGHT')
    max_width = print_space.get('WIDTH')
    source_name = str(alto_file)

    data = []
    for block in root.iterfind('.//' + ns + 'TextBlock'):
        block_id = block.get('ID')
        block_height = block.get('HEIGHT')
        block_width = block.get('WIDTH')
        block_top = block.get('VPOS')
        block_left = block.get('HPOS')

        for text_line in block.iterfind('.//' + ns + 'TextLine'):
            # Only String elements that are direct children of the line are
            # collected. A String without CONTENT yields None, which would
            # make str.join raise TypeError, so such entries are skipped.
            words = [
                word.get('CONTENT')
                for word in text_line.findall('./' + ns + 'String')
            ]
            data.append({
                "block_id": block_id,
                "block_height": block_height,
                "block_width": block_width,
                "block_top": block_top,
                "block_left": block_left,
                "height": text_line.get('HEIGHT'),
                "width": text_line.get('WIDTH'),
                "top": text_line.get('VPOS'),
                "left": text_line.get('HPOS'),
                "unit": unit,
                "max_height": max_height,
                "max_width": max_width,
                "file_path": source_name,
                "text": ' '.join(w for w in words if w is not None),
            })
    return data
# Crop every text line listed in the ALTO data out of the page image and
# save it as a binarized PNG inside output_dir.
# Create the target directory first: saving into a missing directory fails.
Path(output_dir).mkdir(parents=True, exist_ok=True)
for line in alto:
    print(line)
    line_array = extract_line_array(
        image, line['height'], line['width'], line['top'], line['left']
    )
    # The file name combines the block id with the line geometry.
    output_path = f"{output_dir}/{line['block_id']}_{line['height']}_{line['width']}_{line['top']}_{line['left']}.png"
    Image.fromarray(line_array).save(output_path)
    print(f"Saved {output_path}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment