Skip to content

Commit

Permalink
[builder] Add objectness scores (#1625)
Browse files Browse the repository at this point in the history
  • Loading branch information
felixdittrich92 authored Jun 7, 2024
1 parent 674a875 commit b9c4e6d
Show file tree
Hide file tree
Showing 23 changed files with 247 additions and 50 deletions.
4 changes: 4 additions & 0 deletions api/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ should yield
0.9101580212741838,
0.2080078125
],
"objectness_score": 0.5,
"lines": [
{
"geometry": [
Expand All @@ -155,6 +156,7 @@ should yield
0.9101580212741838,
0.2080078125
],
"objectness_score": 0.5,
"words": [
{
"value": "Hello",
Expand All @@ -164,6 +166,7 @@ should yield
0.8272978149561669,
0.20703125
],
"objectness_score": 0.5,
"confidence": 1.0,
"crop_orientation": {"value": 0, "confidence": null}
},
Expand All @@ -175,6 +178,7 @@ should yield
0.9101580212741838,
0.2080078125
],
"objectness_score": 0.5,
"confidence": 1.0,
"crop_orientation": {"value": 0, "confidence": null}
}
Expand Down
3 changes: 2 additions & 1 deletion api/app/routes/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ async def text_detection(request: DetectionIn = Depends(), files: List[UploadFil
DetectionOut(
name=filename,
geometries=[
geom[:-1].tolist() if len(geom) == 5 else resolve_geometry(geom.tolist()) for geom in doc[CLASS_NAME]
geom[:-1].tolist() if geom.shape == (5,) else resolve_geometry(geom[:4].tolist())
for geom in doc[CLASS_NAME]
],
)
for doc, filename in zip(predictor(content), filenames)
Expand Down
1 change: 1 addition & 0 deletions api/app/routes/kie.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ async def perform_kie(request: KIEIn = Depends(), files: List[UploadFile] = [Fil
dict(
value=prediction.value,
geometry=resolve_geometry(prediction.geometry),
objectness_score=round(prediction.objectness_score, 2),
confidence=round(prediction.confidence, 2),
crop_orientation=prediction.crop_orientation,
)
Expand Down
3 changes: 3 additions & 0 deletions api/app/routes/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,16 @@ async def perform_ocr(request: OCRIn = Depends(), files: List[UploadFile] = [Fil
blocks=[
OCRBlock(
geometry=resolve_geometry(block.geometry),
objectness_score=round(block.objectness_score, 2),
lines=[
OCRLine(
geometry=resolve_geometry(line.geometry),
objectness_score=round(line.objectness_score, 2),
words=[
OCRWord(
value=word.value,
geometry=resolve_geometry(word.geometry),
objectness_score=round(word.objectness_score, 2),
confidence=round(word.confidence, 2),
crop_orientation=word.crop_orientation,
)
Expand Down
12 changes: 12 additions & 0 deletions api/app/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,21 @@ class DetectionOut(BaseModel):
class OCRWord(BaseModel):
value: str = Field(..., examples=["example"])
geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
objectness_score: float = Field(..., examples=[0.99])
confidence: float = Field(..., examples=[0.99])
crop_orientation: Dict[str, Any] = Field(..., examples=[{"value": 0, "confidence": None}])


class OCRLine(BaseModel):
geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
objectness_score: float = Field(..., examples=[0.99])
words: List[OCRWord] = Field(
...,
examples=[
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
Expand All @@ -79,11 +82,13 @@ class OCRLine(BaseModel):

class OCRBlock(BaseModel):
geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
objectness_score: float = Field(..., examples=[0.99])
lines: List[OCRLine] = Field(
...,
examples=[
{
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"words": [
{
"value": "example",
Expand All @@ -103,13 +108,16 @@ class OCRPage(BaseModel):
examples=[
{
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"lines": [
{
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"words": [
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
Expand All @@ -131,13 +139,16 @@ class OCROut(BaseModel):
examples=[
{
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"lines": [
{
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"words": [
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
Expand All @@ -157,6 +168,7 @@ class KIEElement(BaseModel):
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"objectness_score": 0.99,
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
Expand Down
12 changes: 12 additions & 0 deletions api/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,14 @@ def mock_kie_response():
{
"value": "Hello",
"geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
"objectness_score": 0.39,
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": None},
},
{
"value": "world!",
"geometry": [0.8176307908857315, 0.1787109375, 0.9101580212741838, 0.2080078125],
"objectness_score": 0.39,
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": None},
},
Expand Down Expand Up @@ -118,6 +120,7 @@ def mock_kie_response():
0.7470247745513916,
0.20540954172611237,
],
"objectness_score": 0.5,
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": 1},
},
Expand All @@ -133,6 +136,7 @@ def mock_kie_response():
0.8173396587371826,
0.20735852420330048,
],
"objectness_score": 0.5,
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": 1},
},
Expand All @@ -156,13 +160,16 @@ def mock_ocr_response():
"blocks": [
{
"geometry": [0.7471996155154171, 0.1787109375, 0.9101580212741838, 0.2080078125],
"objectness_score": 0.39,
"lines": [
{
"geometry": [0.7471996155154171, 0.1787109375, 0.9101580212741838, 0.2080078125],
"objectness_score": 0.39,
"words": [
{
"value": "Hello",
"geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
"objectness_score": 0.39,
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": None},
},
Expand All @@ -174,6 +181,7 @@ def mock_ocr_response():
0.9101580212741838,
0.2080078125,
],
"objectness_score": 0.39,
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": None},
},
Expand Down Expand Up @@ -204,6 +212,7 @@ def mock_ocr_response():
0.7460724711418152,
0.20930007100105286,
],
"objectness_score": 0.5,
"lines": [
{
"geometry": [
Expand All @@ -216,6 +225,7 @@ def mock_ocr_response():
0.7460724711418152,
0.20930007100105286,
],
"objectness_score": 0.5,
"words": [
{
"value": "Hello",
Expand All @@ -229,6 +239,7 @@ def mock_ocr_response():
0.7470247745513916,
0.20540954172611237,
],
"objectness_score": 0.5,
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": 1},
},
Expand All @@ -244,6 +255,7 @@ def mock_ocr_response():
0.8173396587371826,
0.20735852420330048,
],
"objectness_score": 0.5,
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": 1},
},
Expand Down
1 change: 1 addition & 0 deletions api/tests/routes/test_kie.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def common_test(json_response, expected_response):
assert isinstance(pred_item["value"], str) and pred_item["value"] == expected_pred_item["value"]
assert isinstance(pred_item["confidence"], (int, float))
np.testing.assert_allclose(pred_item["geometry"], expected_pred_item["geometry"], rtol=1e-2)
assert isinstance(pred_item["objectness_score"], (int, float))
assert isinstance(pred_item["crop_orientation"], dict)
assert isinstance(pred_item["crop_orientation"]["value"], int) and isinstance(
pred_item["crop_orientation"]["confidence"], (float, int, type(None))
Expand Down
3 changes: 3 additions & 0 deletions api/tests/routes/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@ def common_test(json_response, expected_response):
for item, expected_item in zip(first_pred["items"], expected_response["items"]):
for block, expected_block in zip(item["blocks"], expected_item["blocks"]):
np.testing.assert_allclose(block["geometry"], expected_block["geometry"], rtol=1e-2)
assert isinstance(block["objectness_score"], (int, float))
for line, expected_line in zip(block["lines"], expected_block["lines"]):
np.testing.assert_allclose(line["geometry"], expected_line["geometry"], rtol=1e-2)
assert isinstance(line["objectness_score"], (int, float))
for word, expected_word in zip(line["words"], expected_line["words"]):
np.testing.assert_allclose(word["geometry"], expected_word["geometry"], rtol=1e-2)
assert isinstance(word["objectness_score"], (int, float))
assert isinstance(word["value"], str) and word["value"] == expected_word["value"]
assert isinstance(word["confidence"], (int, float))
assert isinstance(word["crop_orientation"], dict)
Expand Down
3 changes: 3 additions & 0 deletions docs/source/using_doctr/using_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -378,18 +378,21 @@ For reference, here is the export for the same `Document` as above::
'value': 'No.',
'confidence': 0.914085328578949,
'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)),
'objectness_score': 0.96,
'crop_orientation': {'value': 0, 'confidence': None},
},
{
'value': 'RECEIPT',
'confidence': 0.9949972033500671,
'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)),
'objectness_score': 0.99,
'crop_orientation': {'value': 0, 'confidence': None},
},
{
'value': 'DATE',
'confidence': 0.9578408598899841,
'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)),
'objectness_score': 0.99,
'crop_orientation': {'value': 0, 'confidence': None},
}
]
Expand Down
19 changes: 16 additions & 3 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,23 +72,26 @@ class Word(Element):
confidence: the confidence associated with the text prediction
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
the page's size
objectness_score: the objectness score of the detection
crop_orientation: the general orientation of the crop in degrees and its confidence
"""

_exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
_exported_keys: List[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
_children_names: List[str] = []

def __init__(
    self,
    value: str,
    confidence: float,
    geometry: Union[BoundingBox, np.ndarray],
    objectness_score: float,
    crop_orientation: Dict[str, Any],
) -> None:
    """Store the attributes describing a single word element.

    Args:
        value: the text string of the word
        confidence: the confidence associated with the text prediction
        geometry: bounding box of the word, with coordinates relative to the page's size
        objectness_score: the objectness score of the detection
        crop_orientation: the general orientation of the crop in degrees and its confidence
    """
    super().__init__()
    # Text prediction and its confidence
    self.value, self.confidence = value, confidence
    # Localization of the word and the objectness score of its detection
    self.geometry, self.objectness_score = geometry, objectness_score
    # Orientation of the crop and its confidence
    self.crop_orientation = crop_orientation

def render(self) -> str:
Expand Down Expand Up @@ -148,15 +151,19 @@ class Line(Element):
all words in it.
"""

_exported_keys: List[str] = ["geometry"]
_exported_keys: List[str] = ["geometry", "objectness_score"]
_children_names: List[str] = ["words"]
words: List[Word] = []

def __init__(
self,
words: List[Word],
geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
objectness_score: Optional[float] = None,
) -> None:
# Compute the objectness score of the line
if objectness_score is None:
objectness_score = float(np.mean([w.objectness_score for w in words]))
# Resolve the geometry using the smallest enclosing bounding box
if geometry is None:
# Check whether this is a rotated or straight box
Expand All @@ -165,6 +172,7 @@ def __init__(

super().__init__(words=words)
self.geometry = geometry
self.objectness_score = objectness_score

def render(self) -> str:
"""Renders the full text of the element"""
Expand Down Expand Up @@ -202,7 +210,7 @@ class Block(Element):
all lines and artefacts in it.
"""

_exported_keys: List[str] = ["geometry"]
_exported_keys: List[str] = ["geometry", "objectness_score"]
_children_names: List[str] = ["lines", "artefacts"]
lines: List[Line] = []
artefacts: List[Artefact] = []
Expand All @@ -212,7 +220,11 @@ def __init__(
lines: List[Line] = [],
artefacts: List[Artefact] = [],
geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
objectness_score: Optional[float] = None,
) -> None:
# Compute the objectness score of the block (mean over the words of all its lines)
if objectness_score is None:
objectness_score = float(np.mean([w.objectness_score for line in lines for w in line.words]))
# Resolve the geometry using the smallest enclosing bounding box
if geometry is None:
line_boxes = [word.geometry for line in lines for word in line.words]
Expand All @@ -224,6 +236,7 @@ def __init__(

super().__init__(lines=lines, artefacts=artefacts)
self.geometry = geometry
self.objectness_score = objectness_score

def render(self, line_break: str = "\n") -> str:
"""Renders the full text of the element"""
Expand Down
Loading

0 comments on commit b9c4e6d

Please sign in to comment.