# nikopartanen/alto2lines.py

## alto2lines.py
from pathlib import Path

try:
    import xml.etree.cElementTree as ET
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree is the replacement.
    import xml.etree.ElementTree as ET

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from PIL import Image
from google.protobuf.json_format import MessageToJson
from skimage import filters

# Kept last, as in the original order, so names from the star import still
# take precedence over the imports above on any clash.
from mikatools import *

# PUT THE IMAGE AND THE ALTO XML HERE.
# output_dir is the directory the cropped line images are written to.
# The image ids are currently not unique when there are many images coming
# from different files.
# Original TODO: "line 106" should also use line['left'] etc. so that the
# names become unique (the output name built in the loop at the end of the
# script does include the line geometry, but nothing identifying the file).
# NOTE(review): load_image and read_alto are defined further down in this
# file; unless `mikatools` exports functions with these names, the calls
# below raise NameError -- confirm, and move this setup below the definitions.

image_path = "brigadir/Brigadir_koi_1932_03_17_0001.jpg"
alto_path = "brigadir/Brigadir_koi_1932_03_17_0001.xml"
output_dir = "uusi_esimerkki"

image = load_image(image_path)
# Original note said "version = 3"; the call itself relies on the default version.
alto = read_alto(alto_path)

def load_image(path):
    """Open the image file at ``path`` with PIL and return the image object."""
    return Image.open(path)

def extract_line_array(pil_image, height, width, top, left):
    """Crop one text line out of a page image and binarize it.

    Args:
        pil_image: Image object providing ``crop`` and ``convert``.
        height: Height of the line box.
        width: Width of the line box.
        top: Vertical position of the box's upper edge.
        left: Horizontal position of the box's left edge.

        The four box values may be ints, floats or numeric strings;
        fractional values such as ``"12.5"`` are truncated to whole numbers.

    Returns:
        The array returned by ``binarize_array`` for the cropped region after
        conversion to mode "L", using the Otsu threshold of that region.
    """
    # ALTO position attributes may be written as decimals (e.g. "12.5"), and
    # int("12.5") raises ValueError, so parse through float() first.
    left_px, top_px, width_px, height_px = (
        int(float(value)) for value in (left, top, width, height)
    )

    # PIL crop boxes are (left, upper, right, lower).
    box = (left_px, top_px, left_px + width_px, top_px + height_px)
    grayscale = pil_image.crop(box).convert("L")

    pixels = np.array(grayscale)
    threshold = filters.threshold_otsu(pixels)

    return binarize_array(pixels, threshold)

def binarize_array(numpy_array, threshold):
    """Binarize a numpy array in place.

    Every element strictly greater than ``threshold`` becomes 255 and every
    other element becomes 0.

    Args:
        numpy_array: ``numpy.ndarray`` to binarize; it is modified in place.
        threshold: Cut-off value each element is compared against.

    Returns:
        The same array object that was passed in.
    """
    # Build the mask before writing anything, so every comparison is made
    # against the original values.
    above = numpy_array > threshold
    numpy_array[above] = 255
    numpy_array[~above] = 0
    return numpy_array

def read_alto(alto_file, version = 2):
    """Read the text lines of an ALTO XML file into a list of dicts.

    Args:
        alto_file: Path or file object accepted by ``ET.parse``.
        version: ALTO schema version, used to build the expected namespace
            ``http://www.loc.gov/standards/alto/ns-v<version>#``. If the
            document has no ``MeasurementUnit`` element in that namespace,
            the namespace of the document's root element is used instead.

    Returns:
        A list with one dict per ``TextLine`` found inside a ``TextBlock``.
        Keys: ``block_id``, ``block_height``, ``block_width``, ``block_top``,
        ``block_left`` (attributes of the enclosing block), ``height``,
        ``width``, ``top``, ``left`` (attributes of the line), ``unit``
        (text of ``MeasurementUnit``), ``max_height``, ``max_width``
        (size of ``PrintSpace``, ``None`` when the document has none),
        ``file_path`` (``str(alto_file)``) and ``text`` (the ``CONTENT`` of
        the line's direct ``String`` children joined with spaces).
        Attribute values are returned as found in the XML (strings, or
        ``None`` when the attribute is absent).

    Raises:
        AttributeError: If no ``MeasurementUnit`` element can be found.
    """
    tree = ET.parse(alto_file)
    root = tree.getroot()

    namespace = '{http://www.loc.gov/standards/alto/ns-v' + str(version) + '#}'

    unit_element = root.find(f'.//{namespace}MeasurementUnit')
    if unit_element is None:
        # The requested version does not match this document: fall back to
        # the namespace of the root tag ('' when the root has no namespace).
        namespace = root.tag[:root.tag.find('}') + 1]
        unit_element = root.find(f'.//{namespace}MeasurementUnit')
    if unit_element is None:
        # Same exception type the failed ``.text`` lookup used to raise.
        raise AttributeError(
            f"no MeasurementUnit element found in {alto_file!r}"
        )
    unit = unit_element.text

    # PrintSpace may be absent; look it up once and tolerate a missing one.
    print_space = root.find(f'.//{namespace}PrintSpace')
    if print_space is None:
        max_height = max_width = None
    else:
        max_height = print_space.get('HEIGHT')
        max_width = print_space.get('WIDTH')

    data = []

    for block in root.iterfind(f'.//{namespace}TextBlock'):

        # Shared by every line of this block.
        block_fields = {
            "block_id": block.get('ID'),
            "block_height": block.get('HEIGHT'),
            "block_width": block.get('WIDTH'),
            "block_top": block.get('VPOS'),
            "block_left": block.get('HPOS'),
        }

        for line in block.iterfind(f'.//{namespace}TextLine'):

            # A String without CONTENT contributes an empty word instead of
            # breaking the join with a None.
            words = [
                string.get('CONTENT', '')
                for string in line.findall(f'./{namespace}String')
            ]

            data.append({
                **block_fields,
                "height": line.get('HEIGHT'),
                "width": line.get('WIDTH'),
                "top": line.get('VPOS'),
                "left": line.get('HPOS'),
                "unit": unit,
                "max_height": max_height,
                "max_width": max_width,
                "file_path": str(alto_file),
                "text": ' '.join(words),
            })

    return data

# Crop every ALTO text line out of the page image and save it as its own PNG.

# Saving into a directory that does not exist fails, so create it up front.
Path(output_dir).mkdir(parents=True, exist_ok=True)

for line in alto:

    print(line)

    line_array = extract_line_array(image,  line['height'], line['width'], line['top'], line['left'])

    # File name: block id followed by the line's height, width, top and left.
    output_path = f"{output_dir}/{line['block_id']}_{line['height']}_{line['width']}_{line['top']}_{line['left']}.png"

    image_from_array = Image.fromarray(line_array)
    image_from_array.save(output_path)
    print(f"Saved {output_path}")
	import xml.etree.cElementTree as ET

	from PIL import Image
	import numpy as np
	from google.protobuf.json_format import MessageToJson

	import matplotlib.pyplot as plt
	import matplotlib.patches as patches
	from pathlib import Path
	import sklearn as sk
	from skimage import filters

	from mikatools import *

	# PUT THE IMAGE AND THE ALTO XML HERE.
	# output_dir is the directory the cropped line images are written to.
	# The image ids are currently not unique when there are many images coming
	# from different files.
	# Original TODO: "line 106" should also use line['left'] etc. so that the
	# names become unique.
	# NOTE(review): these statements repeat the setup found near the top of
	# this file -- presumably a paste artifact; confirm before relying on them.

	image_path = "brigadir/Brigadir_koi_1932_03_17_0001.jpg"
	alto_path = "brigadir/Brigadir_koi_1932_03_17_0001.xml"
	output_dir = "uusi_esimerkki"

	image = load_image(image_path)
	# Original note said "version = 3"; the call itself relies on the default version.
	alto = read_alto(alto_path)

	def load_image(path):
		"""Open the image file at ``path`` with PIL and return the image object."""
		return Image.open(path)

	def extract_line_array(pil_image, height, width, top, left):
		"""Crop one text line out of a page image and binarize it.

		Args:
			pil_image: Image object providing ``crop`` and ``convert``.
			height: Height of the line box.
			width: Width of the line box.
			top: Vertical position of the box's upper edge.
			left: Horizontal position of the box's left edge.

			The four box values may be ints, floats or numeric strings;
			fractional values such as ``"12.5"`` are truncated to whole numbers.

		Returns:
			The array returned by ``binarize_array`` for the cropped region after
			conversion to mode "L", using the Otsu threshold of that region.
		"""
		# ALTO position attributes may be written as decimals (e.g. "12.5"), and
		# int("12.5") raises ValueError, so parse through float() first.
		left_px, top_px, width_px, height_px = (
			int(float(value)) for value in (left, top, width, height)
		)

		# PIL crop boxes are (left, upper, right, lower).
		box = (left_px, top_px, left_px + width_px, top_px + height_px)
		grayscale = pil_image.crop(box).convert("L")

		pixels = np.array(grayscale)
		threshold = filters.threshold_otsu(pixels)

		return binarize_array(pixels, threshold)

	def binarize_array(numpy_array, threshold):
		"""Binarize a numpy array in place.

		Every element strictly greater than ``threshold`` becomes 255 and every
		other element becomes 0.

		Args:
			numpy_array: ``numpy.ndarray`` to binarize; it is modified in place.
			threshold: Cut-off value each element is compared against.

		Returns:
			The same array object that was passed in.
		"""
		# Build the mask before writing anything, so every comparison is made
		# against the original values.
		above = numpy_array > threshold
		numpy_array[above] = 255
		numpy_array[~above] = 0
		return numpy_array

	def read_alto(alto_file, version = 2):
		"""Read the text lines of an ALTO XML file into a list of dicts.

		Args:
			alto_file: Path or file object accepted by ``ET.parse``.
			version: ALTO schema version, used to build the expected namespace
				``http://www.loc.gov/standards/alto/ns-v<version>#``. If the
				document has no ``MeasurementUnit`` element in that namespace,
				the namespace of the document's root element is used instead.

		Returns:
			A list with one dict per ``TextLine`` found inside a ``TextBlock``.
			Keys: ``block_id``, ``block_height``, ``block_width``, ``block_top``,
			``block_left`` (attributes of the enclosing block), ``height``,
			``width``, ``top``, ``left`` (attributes of the line), ``unit``
			(text of ``MeasurementUnit``), ``max_height``, ``max_width``
			(size of ``PrintSpace``, ``None`` when the document has none),
			``file_path`` (``str(alto_file)``) and ``text`` (the ``CONTENT`` of
			the line's direct ``String`` children joined with spaces).
			Attribute values are returned as found in the XML (strings, or
			``None`` when the attribute is absent).

		Raises:
			AttributeError: If no ``MeasurementUnit`` element can be found.
		"""
		tree = ET.parse(alto_file)
		root = tree.getroot()

		namespace = '{http://www.loc.gov/standards/alto/ns-v' + str(version) + '#}'

		unit_element = root.find(f'.//{namespace}MeasurementUnit')
		if unit_element is None:
			# The requested version does not match this document: fall back to
			# the namespace of the root tag ('' when the root has no namespace).
			namespace = root.tag[:root.tag.find('}') + 1]
			unit_element = root.find(f'.//{namespace}MeasurementUnit')
		if unit_element is None:
			# Same exception type the failed ``.text`` lookup used to raise.
			raise AttributeError(
				f"no MeasurementUnit element found in {alto_file!r}"
			)
		unit = unit_element.text

		# PrintSpace may be absent; look it up once and tolerate a missing one.
		print_space = root.find(f'.//{namespace}PrintSpace')
		if print_space is None:
			max_height = max_width = None
		else:
			max_height = print_space.get('HEIGHT')
			max_width = print_space.get('WIDTH')

		data = []

		for block in root.iterfind(f'.//{namespace}TextBlock'):

			# Shared by every line of this block.
			block_fields = {
				"block_id": block.get('ID'),
				"block_height": block.get('HEIGHT'),
				"block_width": block.get('WIDTH'),
				"block_top": block.get('VPOS'),
				"block_left": block.get('HPOS'),
			}

			for line in block.iterfind(f'.//{namespace}TextLine'):

				# A String without CONTENT contributes an empty word instead of
				# breaking the join with a None.
				words = [
					string.get('CONTENT', '')
					for string in line.findall(f'./{namespace}String')
				]

				data.append({
					**block_fields,
					"height": line.get('HEIGHT'),
					"width": line.get('WIDTH'),
					"top": line.get('VPOS'),
					"left": line.get('HPOS'),
					"unit": unit,
					"max_height": max_height,
					"max_width": max_width,
					"file_path": str(alto_file),
					"text": ' '.join(words),
				})

		return data

	# Crop every ALTO text line out of the page image and save it as its own PNG.

	# Saving into a directory that does not exist fails, so create it up front.
	Path(output_dir).mkdir(parents=True, exist_ok=True)

	for line in alto:

		print(line)

		line_array = extract_line_array(image, line['height'], line['width'], line['top'], line['left'])

		# File name: block id followed by the line's height, width, top and left.
		output_path = f"{output_dir}/{line['block_id']}_{line['height']}_{line['width']}_{line['top']}_{line['left']}.png"

		image_from_array = Image.fromarray(line_array)
		image_from_array.save(output_path)
		print(f"Saved {output_path}")