Skip to content
Snippets Groups Projects
Commit 81cd912a authored by Alexander Hartelt's avatar Alexander Hartelt
Browse files

minor refactor

parent 0deef26c
Branches
No related tags found
No related merge requests found
data
*pycl
\ No newline at end of file
File added
File added
This diff is collapsed.
import os
from typing import Tuple, List
import math
import xml.etree.ElementTree as ET
from xml.dom import minidom
# This class converts a set of baselines (one TextRegion per baseline) into a PAGE XML string or file.
class XMLGenerator:
    '''
    Serialises the baselines of one page image as PAGE XML.

    Every baseline is written as its own ``TextRegion`` holding a single
    ``TextLine`` with one ``Baseline`` element. The regions are additionally
    listed, in input order, in the ``ReadingOrder`` group of the page.
    '''

    def __init__(self, imageWidth: int, imageHeight: int, imageFilename: str, baselines):
        '''
        :param imageWidth: value written to the ``imageWidth`` attribute of ``Page``
        :param imageHeight: value written to the ``imageHeight`` attribute of ``Page``
        :param imageFilename: value written to the ``imageFilename`` attribute of
            ``Page``; also used as the stem of the output file name
        :param baselines: sequence of baselines, each a sequence of (x, y) points
        '''
        self.imageWidth = imageWidth
        self.imageHeight = imageHeight
        self.imageFilename = imageFilename
        self.baselines = baselines

    def baselines_to_xml_string(self) -> str:
        '''
        creates the xml to the given baselines
        :return: pretty-printed xml-string of baselines
        '''
        # NOTE(review): the default namespace below is the 2017-07-15 schema
        # while schemaLocation names 2019-07-15 -- confirm which is intended.
        page_namespace = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15'
        attr_qname = ET.QName("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation")
        root_node = ET.Element("PcGts", {
            attr_qname: "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"})
        # The default namespace is emitted as a plain attribute so that the
        # child tags can stay unprefixed.
        root_node.set('xmlns', page_namespace)

        page_node = ET.SubElement(root_node, "Page", imageFilename=str(self.imageFilename),
                                  imageWidth=str(self.imageWidth), imageHeight=str(self.imageHeight))
        reading_order_node = ET.SubElement(page_node, "ReadingOrder")
        ordered_group_node = ET.SubElement(reading_order_node, "OrderedGroup", id="ro1",
                                           caption="Region reading order")

        for i, baseline in enumerate(self.baselines):
            region_id = "TextRegion" + str(i)
            tr_node = ET.SubElement(page_node, "TextRegion", id=region_id)
            ET.SubElement(ordered_group_node, "RegionRefIndexed", index=str(i), regionRef=region_id)
            # No region Coords are emitted; only the baseline is available here.
            # The line id carries the index so that ids stay unique across the
            # document (a constant "TextLine" would repeat for every region).
            tl_node = ET.SubElement(tr_node, "TextLine", id="TextLine" + str(i))
            ET.SubElement(tl_node, "Baseline", points=self.coords_to_string(baseline))

        # Round-trip through minidom purely to get indented output.
        return minidom.parseString(ET.tostring(root_node)).toprettyxml(indent='    ')

    def coords_to_string(self, coords: List[Tuple[int, int]]) -> str:
        '''
        transforms int tuples to string for xml
        :param coords: list of int tuples
        :return: "x,y" pairs separated by single spaces ("" for no coords)
        '''
        return " ".join(str(coord[0]) + "," + str(coord[1]) for coord in coords)

    def save_textregions_as_xml(self, output_path: str):
        '''
        Transform textregions to xml and save it to output_path
        :param output_path: directory in which "<imageFilename>.xml" is written
        '''
        completeName = os.path.join(output_path, self.imageFilename + ".xml")
        # Serialise first so that a failure does not leave an empty file behind.
        output_String = self.baselines_to_xml_string()
        # The XML declaration carries no encoding, which readers take as UTF-8,
        # so write UTF-8 explicitly instead of the platform default.
        with open(completeName, "w", encoding="utf-8") as xml_file:
            xml_file.write(output_String)
def annotate_with_XMLNS_prefixes(tree, xmlns_prefix, skip_root_node=True):
    '''
    Prefixes element tags in place with "<xmlns_prefix>:".

    Tags that already contain a ':' are left untouched.

    :param tree: an Element, or an object with getroot() (e.g. an ElementTree)
    :param xmlns_prefix: prefix to put in front of each tag, without the colon
    :param skip_root_node: if True the root element keeps its tag
    :return: None, the tree is modified in place
    '''
    if not ET.iselement(tree):
        tree = tree.getroot()
    iterator = tree.iter()
    if skip_root_node:
        # iter() yields the root itself first; consume it. The builtin next()
        # is used because iterators have no .next() method in Python 3.
        next(iterator, None)
    for e in iterator:
        # Comments and processing instructions have a non-string tag.
        if isinstance(e.tag, str) and ':' not in e.tag:
            e.tag = xmlns_prefix + ":" + e.tag
'''
Visvalingam-Whyatt method of poly-line vertex reduction
Visvalingam, M and Whyatt J D (1993)
"Line Generalisation by Repeated Elimination of Points", Cartographic J., 30 (1), 46 - 51
Described here:
http://web.archive.org/web/20100428020453/http://www2.dcs.hull.ac.uk/CISRG/publications/DPs/DP10/DP10.html
=========================================
The MIT License (MIT)
Copyright (c) 2014 Elliot Hallmark
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================
'''
from numpy import array, argmin
import numpy as np
def triangle_area(p1, p2, p3):
    """
    Area of the triangle spanned by three vertices.

    Only the first two components (x, y) of each vertex are used.
    """
    ax, ay = p1[0], p1[1]
    bx, by = p2[0], p2[1]
    cx, cy = p3[0], p3[1]
    # Signed doubled area; the sign depends on the vertex orientation.
    doubled = ax * (by - cy) + bx * (cy - ay) + cx * (ay - by)
    return abs(doubled) / 2.
def triangle_areas_from_array(arr):
    '''
    take an (N,2) array of points and return a length-N 1-D
    array of the areas of the triangles formed by each point
    and its two neighbours, where the first and last areas
    are np.inf

    Non-floating input (e.g. integer pixel coordinates) is converted
    to float64 first, because the result has to hold np.inf and
    fractional areas. The input array itself is never modified.

    see triangle_area for algorithm
    '''
    arr = np.asarray(arr)
    if not np.issubdtype(arr.dtype, np.floating):
        # astype returns a new array; the result must be kept.
        arr = arr.astype(np.float64)

    result = np.empty((len(arr),), arr.dtype)
    if len(arr) == 0:
        # nothing to mark as an endpoint
        return result
    result[0] = np.inf
    result[-1] = np.inf

    p1 = arr[:-2]
    p2 = arr[1:-1]
    p3 = arr[2:]

    # accumulators to avoid unnecessary intermediate arrays
    accr = result[1:-1]  # view: accumulate directly into result
    acc1 = np.empty_like(accr)

    # accr = p1.x * (p2.y - p3.y)
    np.subtract(p2[:, 1], p3[:, 1], out=accr)
    np.multiply(p1[:, 0], accr, out=accr)
    # accr += p2.x * (p3.y - p1.y)
    np.subtract(p3[:, 1], p1[:, 1], out=acc1)
    np.multiply(p2[:, 0], acc1, out=acc1)
    np.add(acc1, accr, out=accr)
    # accr += p3.x * (p1.y - p2.y)
    np.subtract(p1[:, 1], p2[:, 1], out=acc1)
    np.multiply(p3[:, 0], acc1, out=acc1)
    np.add(acc1, accr, out=accr)
    np.abs(accr, out=accr)
    accr /= 2.
    # Notice: accr was writing into result, so the answer is in there
    return result
# the final value in thresholds is np.inf, which will never be
# the min value. So, I am safe in "deleting" an index by
# just shifting the array over on top of it
def remove(s, i):
    '''
    Drop element i of s in place by shifting everything after it
    one slot to the left. The length of s does not change: the
    last value simply ends up duplicated at the end.
    The original author measured this as ~3.5x faster than numpy.delete.
    '''
    shifted_tail = s[i + 1:]
    s[i:-1] = shifted_tail
class VWSimplifier(object):
    # Poly-line simplifier: assigns every input point a threshold (an area
    # value computed in build_thresholds) once, then answers filtering
    # queries by masking the point array with those thresholds.

    def __init__(self, pts):
        '''Initialize with points. Takes some time to build
        the thresholds but then all threshold filtering later
        is ultra fast.

        :param pts: sequence of points; it is converted with np.array and
            indexed as pts[k][0], pts[k][1] by the area helpers
        '''
        self.pts = np.array(pts)
        # per-point area value, aligned with self.pts
        self.thresholds = self.build_thresholds()
        # the same values as a list sorted from largest to smallest
        self.ordered_thresholds = sorted(self.thresholds, reverse=True)

    def build_thresholds(self):
        '''Compute the area value of each vertex, which one would
        use to mask an array of points for any threshold value.

        Repeatedly takes the vertex with the smallest area out of a working
        copy, recomputes the areas of its two neighbours and records them,
        until the smallest remaining area is np.inf.

        :return: a numpy.array (length of pts) of the areas
        '''
        pts = self.pts
        nmax = len(pts)
        # initial triangle area per point (the helper documents the first and
        # last entries as np.inf)
        real_areas = triangle_areas_from_array(pts)
        real_indices = list(range(nmax))
        # destructable copies
        # ARG! areas=real_areas[:] doesn't make a copy!
        areas = np.copy(real_areas)
        # i maps positions in the shrinking working set to indices into pts
        i = real_indices[:]
        # pick first point and set up for loop
        min_vert = argmin(areas)
        this_area = areas[min_vert]
        # areas and i are modified for each point finished
        remove(areas, min_vert)  # faster
        # areas = np.delete(areas,min_vert) #slower
        real_idx = i.pop(min_vert)
        # cntr = 3
        while this_area < np.inf:
            '''min_vert was removed from areas and i. Now,
            adjust the adjacent areas and remove the new
            min_vert.
            Now that min_vert was filtered out, min_vert points
            to the point after the deleted point.'''
            # skip stays False unless an updated neighbour is known to be the
            # next vertex to remove; it then holds that neighbour's position.
            skip = False  # modified area may be the next minvert
            try:
                right_area = triangle_area(pts[i[min_vert - 1]],
                                           pts[i[min_vert]], pts[i[min_vert + 1]])
            except IndexError:
                # trying to update area of endpoint. Don't do it
                pass
            else:
                right_idx = i[min_vert]
                if right_area <= this_area:
                    # even if the point now has a smaller area,
                    # it ultimately is not more significant than
                    # the last point, which needs to be removed
                    # first to justify removing this point.
                    # Though this point is the next most significant
                    right_area = this_area
                    # min_vert refers to the point to the right of
                    # the previous min_vert, so we can leave it
                    # unchanged if it is still the min_vert
                    skip = min_vert
                # update both collections of areas
                real_areas[right_idx] = right_area
                areas[min_vert] = right_area
            if min_vert > 1:
                # cant try/except because 0-1=-1 is a valid index
                left_area = triangle_area(pts[i[min_vert - 2]],
                                          pts[i[min_vert - 1]], pts[i[min_vert]])
                if left_area <= this_area:
                    # same justification as above
                    left_area = this_area
                    skip = min_vert - 1
                real_areas[i[min_vert - 1]] = left_area
                areas[min_vert - 1] = left_area
            # only argmin if we have to.
            min_vert = skip or argmin(areas)
            real_idx = i.pop(min_vert)
            this_area = areas[min_vert]
            # areas = np.delete(areas,min_vert) #slower
            remove(areas, min_vert)  # faster
            '''if sum(np.where(areas==np.inf)[0]) != sum(list(reversed(range(len(areas))))[:cntr]):
            print "broke:",np.where(areas==np.inf)[0],cntr
            break
            cntr+=1
            #if real_areas[0]<np.inf or real_areas[-1]<np.inf:
            # print "NO!", real_areas[0], real_areas[-1]
            '''
        return real_areas

    def from_threshold(self, threshold):
        # Keep the points whose area value is at least `threshold`.
        return self.pts[self.thresholds >= threshold]

    def from_number(self, n):
        # Look up entry int(n) of the descending threshold list and keep the
        # points whose area value is strictly greater than it. If that index
        # is out of range, all points are returned.
        thresholds = self.ordered_thresholds
        try:
            threshold = thresholds[int(n)]
        except IndexError:
            return self.pts
        return self.pts[self.thresholds > threshold]

    def from_ratio(self, r):
        # Same as from_number with n = r * (number of points).
        # Raises ValueError unless 0 < r <= 1.
        if r <= 0 or r > 1:
            raise ValueError("Ratio must be 0<r<=1")
        else:
            return self.from_number(r * len(self.thresholds))
def fancy_parametric(k):
    '''
    Build the two coordinate functions x(t), y(t) of a parametric test
    curve controlled by k.

    good k's: .33,.5,.65,.7,1.3,1.4,1.9,3,4,5

    :return: tuple (xt, yt) of callables taking the parameter t
    '''
    def x_of_t(t):
        return (k - 1) * np.cos(t) + np.cos(t * (k - 1))

    def y_of_t(t):
        return (k - 1) * np.sin(t) - np.sin(t * (k - 1))

    return x_of_t, y_of_t
if __name__ == "__main__":
    # Demo: simplify a dense parametric curve, report the timing, plot the result.
    from time import time

    n = 5000
    curve_x, curve_y = fancy_parametric(1.4)
    angles = np.linspace(0, 16 * np.pi, n)
    dense_pts = np.array([[curve_x(angle), curve_y(angle)] for angle in angles])

    # Time threshold construction plus one query together.
    started = time()
    reduced_pts = VWSimplifier(dense_pts).from_number(1000)
    finished = time()
    removed = n - len(reduced_pts)
    print("%s vertices removed in %02f seconds" % (removed, finished - started))

    # Imported only here so that matplotlib is needed just for the demo plot.
    import matplotlib.pyplot as plot
    plot.plot(reduced_pts[:, 0], reduced_pts[:, 1], color='r')
    plot.show()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment