[builder] Add objectness scores (#1625)

mindee · Jun 7, 2024 · b9c4e6d · b9c4e6d
1 parent 674a875
commit b9c4e6d
Show file tree

Hide file tree

Showing 23 changed files with 247 additions and 50 deletions.
diff --git a/api/README.md b/api/README.md
@@ -147,6 +147,7 @@ should yield
               0.9101580212741838,
               0.2080078125
             ],
+            "objectness_score": 0.5,
             "lines": [
               {
                 "geometry": [
@@ -155,6 +156,7 @@ should yield
                   0.9101580212741838,
                   0.2080078125
                 ],
+                "objectness_score": 0.5,
                 "words": [
                   {
                     "value": "Hello",
@@ -164,6 +166,7 @@ should yield
                       0.8272978149561669,
                       0.20703125
                     ],
+                    "objectness_score": 0.5,
                     "confidence": 1.0,
                     "crop_orientation": {"value": 0, "confidence": null}
                   },
@@ -175,6 +178,7 @@ should yield
                       0.9101580212741838,
                       0.2080078125
                     ],
+                    "objectness_score": 0.5,
                     "confidence": 1.0,
                     "crop_orientation": {"value": 0, "confidence": null}
                   }

diff --git a/api/app/routes/detection.py b/api/app/routes/detection.py
@@ -28,7 +28,8 @@ async def text_detection(request: DetectionIn = Depends(), files: List[UploadFil
         DetectionOut(
             name=filename,
             geometries=[
-                geom[:-1].tolist() if len(geom) == 5 else resolve_geometry(geom.tolist()) for geom in doc[CLASS_NAME]
+                geom[:-1].tolist() if geom.shape == (5,) else resolve_geometry(geom[:4].tolist())
+                for geom in doc[CLASS_NAME]
             ],
         )
         for doc, filename in zip(predictor(content), filenames)

diff --git a/api/app/routes/kie.py b/api/app/routes/kie.py
@@ -38,6 +38,7 @@ async def perform_kie(request: KIEIn = Depends(), files: List[UploadFile] = [Fil
                         dict(
                             value=prediction.value,
                             geometry=resolve_geometry(prediction.geometry),
+                            objectness_score=round(prediction.objectness_score, 2),
                             confidence=round(prediction.confidence, 2),
                             crop_orientation=prediction.crop_orientation,
                         )

diff --git a/api/app/routes/ocr.py b/api/app/routes/ocr.py
@@ -37,13 +37,16 @@ async def perform_ocr(request: OCRIn = Depends(), files: List[UploadFile] = [Fil
                     blocks=[
                         OCRBlock(
                             geometry=resolve_geometry(block.geometry),
+                            objectness_score=round(block.objectness_score, 2),
                             lines=[
                                 OCRLine(
                                     geometry=resolve_geometry(line.geometry),
+                                    objectness_score=round(line.objectness_score, 2),
                                     words=[
                                         OCRWord(
                                             value=word.value,
                                             geometry=resolve_geometry(word.geometry),
+                                            objectness_score=round(word.objectness_score, 2),
                                             confidence=round(word.confidence, 2),
                                             crop_orientation=word.crop_orientation,
                                         )

diff --git a/api/app/schemas.py b/api/app/schemas.py
@@ -58,18 +58,21 @@ class DetectionOut(BaseModel):
 class OCRWord(BaseModel):
     value: str = Field(..., examples=["example"])
     geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
+    objectness_score: float = Field(..., examples=[0.99])
     confidence: float = Field(..., examples=[0.99])
     crop_orientation: Dict[str, Any] = Field(..., examples=[{"value": 0, "confidence": None}])
 
 
 class OCRLine(BaseModel):
     geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
+    objectness_score: float = Field(..., examples=[0.99])
     words: List[OCRWord] = Field(
         ...,
         examples=[
             {
                 "value": "example",
                 "geometry": [0.0, 0.0, 0.0, 0.0],
+                "objectness_score": 0.99,
                 "confidence": 0.99,
                 "crop_orientation": {"value": 0, "confidence": None},
             }
@@ -79,11 +82,13 @@ class OCRLine(BaseModel):
 
 class OCRBlock(BaseModel):
     geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
+    objectness_score: float = Field(..., examples=[0.99])
     lines: List[OCRLine] = Field(
         ...,
         examples=[
             {
                 "geometry": [0.0, 0.0, 0.0, 0.0],
+                "objectness_score": 0.99,
                 "words": [
                     {
                         "value": "example",
@@ -103,13 +108,16 @@ class OCRPage(BaseModel):
         examples=[
             {
                 "geometry": [0.0, 0.0, 0.0, 0.0],
+                "objectness_score": 0.99,
                 "lines": [
                     {
                         "geometry": [0.0, 0.0, 0.0, 0.0],
+                        "objectness_score": 0.99,
                         "words": [
                             {
                                 "value": "example",
                                 "geometry": [0.0, 0.0, 0.0, 0.0],
+                                "objectness_score": 0.99,
                                 "confidence": 0.99,
                                 "crop_orientation": {"value": 0, "confidence": None},
                             }
@@ -131,13 +139,16 @@ class OCROut(BaseModel):
         examples=[
             {
                 "geometry": [0.0, 0.0, 0.0, 0.0],
+                "objectness_score": 0.99,
                 "lines": [
                     {
                         "geometry": [0.0, 0.0, 0.0, 0.0],
+                        "objectness_score": 0.99,
                         "words": [
                             {
                                 "value": "example",
                                 "geometry": [0.0, 0.0, 0.0, 0.0],
+                                "objectness_score": 0.99,
                                 "confidence": 0.99,
                                 "crop_orientation": {"value": 0, "confidence": None},
                             }
@@ -157,6 +168,7 @@ class KIEElement(BaseModel):
             {
                 "value": "example",
                 "geometry": [0.0, 0.0, 0.0, 0.0],
+                "objectness_score": 0.99,
                 "confidence": 0.99,
                 "crop_orientation": {"value": 0, "confidence": None},
             }

diff --git a/api/tests/conftest.py b/api/tests/conftest.py
@@ -84,12 +84,14 @@ def mock_kie_response():
                         {
                             "value": "Hello",
                             "geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
+                            "objectness_score": 0.39,
                             "confidence": 1,
                             "crop_orientation": {"value": 0, "confidence": None},
                         },
                         {
                             "value": "world!",
                             "geometry": [0.8176307908857315, 0.1787109375, 0.9101580212741838, 0.2080078125],
+                            "objectness_score": 0.39,
                             "confidence": 1,
                             "crop_orientation": {"value": 0, "confidence": None},
                         },
@@ -118,6 +120,7 @@ def mock_kie_response():
                                 0.7470247745513916,
                                 0.20540954172611237,
                             ],
+                            "objectness_score": 0.5,
                             "confidence": 0.99,
                             "crop_orientation": {"value": 0, "confidence": 1},
                         },
@@ -133,6 +136,7 @@ def mock_kie_response():
                                 0.8173396587371826,
                                 0.20735852420330048,
                             ],
+                            "objectness_score": 0.5,
                             "confidence": 1,
                             "crop_orientation": {"value": 0, "confidence": 1},
                         },
@@ -156,13 +160,16 @@ def mock_ocr_response():
                     "blocks": [
                         {
                             "geometry": [0.7471996155154171, 0.1787109375, 0.9101580212741838, 0.2080078125],
+                            "objectness_score": 0.39,
                             "lines": [
                                 {
                                     "geometry": [0.7471996155154171, 0.1787109375, 0.9101580212741838, 0.2080078125],
+                                    "objectness_score": 0.39,
                                     "words": [
                                         {
                                             "value": "Hello",
                                             "geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
+                                            "objectness_score": 0.39,
                                             "confidence": 1,
                                             "crop_orientation": {"value": 0, "confidence": None},
                                         },
@@ -174,6 +181,7 @@ def mock_ocr_response():
                                                 0.9101580212741838,
                                                 0.2080078125,
                                             ],
+                                            "objectness_score": 0.39,
                                             "confidence": 1,
                                             "crop_orientation": {"value": 0, "confidence": None},
                                         },
@@ -204,6 +212,7 @@ def mock_ocr_response():
                                 0.7460724711418152,
                                 0.20930007100105286,
                             ],
+                            "objectness_score": 0.5,
                             "lines": [
                                 {
                                     "geometry": [
@@ -216,6 +225,7 @@ def mock_ocr_response():
                                         0.7460724711418152,
                                         0.20930007100105286,
                                     ],
+                                    "objectness_score": 0.5,
                                     "words": [
                                         {
                                             "value": "Hello",
@@ -229,6 +239,7 @@ def mock_ocr_response():
                                                 0.7470247745513916,
                                                 0.20540954172611237,
                                             ],
+                                            "objectness_score": 0.5,
                                             "confidence": 0.99,
                                             "crop_orientation": {"value": 0, "confidence": 1},
                                         },
@@ -244,6 +255,7 @@ def mock_ocr_response():
                                                 0.8173396587371826,
                                                 0.20735852420330048,
                                             ],
+                                            "objectness_score": 0.5,
                                             "confidence": 1,
                                             "crop_orientation": {"value": 0, "confidence": 1},
                                         },

diff --git a/api/tests/routes/test_kie.py b/api/tests/routes/test_kie.py
@@ -22,6 +22,7 @@ def common_test(json_response, expected_response):
             assert isinstance(pred_item["value"], str) and pred_item["value"] == expected_pred_item["value"]
             assert isinstance(pred_item["confidence"], (int, float))
             np.testing.assert_allclose(pred_item["geometry"], expected_pred_item["geometry"], rtol=1e-2)
+            assert isinstance(pred_item["objectness_score"], (int, float))
             assert isinstance(pred_item["crop_orientation"], dict)
             assert isinstance(pred_item["crop_orientation"]["value"], int) and isinstance(
                 pred_item["crop_orientation"]["confidence"], (float, int, type(None))

diff --git a/api/tests/routes/test_ocr.py b/api/tests/routes/test_ocr.py
@@ -14,10 +14,13 @@ def common_test(json_response, expected_response):
     for item, expected_item in zip(first_pred["items"], expected_response["items"]):
         for block, expected_block in zip(item["blocks"], expected_item["blocks"]):
             np.testing.assert_allclose(block["geometry"], expected_block["geometry"], rtol=1e-2)
+            assert isinstance(block["objectness_score"], (int, float))
             for line, expected_line in zip(block["lines"], expected_block["lines"]):
                 np.testing.assert_allclose(line["geometry"], expected_line["geometry"], rtol=1e-2)
+                assert isinstance(line["objectness_score"], (int, float))
                 for word, expected_word in zip(line["words"], expected_line["words"]):
                     np.testing.assert_allclose(word["geometry"], expected_word["geometry"], rtol=1e-2)
+                    assert isinstance(word["objectness_score"], (int, float))
                     assert isinstance(word["value"], str) and word["value"] == expected_word["value"]
                     assert isinstance(word["confidence"], (int, float))
                     assert isinstance(word["crop_orientation"], dict)

diff --git a/docs/source/using_doctr/using_models.rst b/docs/source/using_doctr/using_models.rst
@@ -378,18 +378,21 @@ For reference, here is the export for the same `Document` as above::
                                     'value': 'No.',
                                     'confidence': 0.914085328578949,
                                     'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)),
+                                    'objectness_score': 0.96,
                                     'crop_orientation': {'value': 0, 'confidence': None},
                                 },
                                 {
                                     'value': 'RECEIPT',
                                     'confidence': 0.9949972033500671,
                                     'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)),
+                                    'objectness_score': 0.99,
                                     'crop_orientation': {'value': 0, 'confidence': None},
                                 },
                                 {
                                     'value': 'DATE',
                                     'confidence': 0.9578408598899841,
                                     'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)),
+                                    'objectness_score': 0.99,
                                     'crop_orientation': {'value': 0, 'confidence': None},
                                 }
                             ]

diff --git a/doctr/io/elements.py b/doctr/io/elements.py
@@ -72,23 +72,26 @@ class Word(Element):
         confidence: the confidence associated with the text prediction
         geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
         the page's size
+        objectness_score: the objectness score of the detection
         crop_orientation: the general orientation of the crop in degrees and its confidence
     """
 
-    _exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
+    _exported_keys: List[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
     _children_names: List[str] = []
 
     def __init__(
         self,
         value: str,
         confidence: float,
         geometry: Union[BoundingBox, np.ndarray],
+        objectness_score: float,
         crop_orientation: Dict[str, Any],
     ) -> None:
         super().__init__()
         self.value = value
         self.confidence = confidence
         self.geometry = geometry
+        self.objectness_score = objectness_score
         self.crop_orientation = crop_orientation
 
     def render(self) -> str:
@@ -148,15 +151,19 @@ class Line(Element):
             all words in it.
     """
 
-    _exported_keys: List[str] = ["geometry"]
+    _exported_keys: List[str] = ["geometry", "objectness_score"]
     _children_names: List[str] = ["words"]
     words: List[Word] = []
 
     def __init__(
         self,
         words: List[Word],
         geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
+        objectness_score: Optional[float] = None,
     ) -> None:
+        # Compute the objectness score of the line
+        if objectness_score is None:
+            objectness_score = float(np.mean([w.objectness_score for w in words]))
         # Resolve the geometry using the smallest enclosing bounding box
         if geometry is None:
             # Check whether this is a rotated or straight box
@@ -165,6 +172,7 @@ def __init__(
 
         super().__init__(words=words)
         self.geometry = geometry
+        self.objectness_score = objectness_score
 
     def render(self) -> str:
         """Renders the full text of the element"""
@@ -202,7 +210,7 @@ class Block(Element):
             all lines and artefacts in it.
     """
 
-    _exported_keys: List[str] = ["geometry"]
+    _exported_keys: List[str] = ["geometry", "objectness_score"]
     _children_names: List[str] = ["lines", "artefacts"]
     lines: List[Line] = []
     artefacts: List[Artefact] = []
@@ -212,7 +220,11 @@ def __init__(
         lines: List[Line] = [],
         artefacts: List[Artefact] = [],
         geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
+        objectness_score: Optional[float] = None,
     ) -> None:
+        # Compute the objectness score of the block
+        if objectness_score is None:
+            objectness_score = float(np.mean([w.objectness_score for line in lines for w in line.words]))
         # Resolve the geometry using the smallest enclosing bounding box
         if geometry is None:
             line_boxes = [word.geometry for line in lines for word in line.words]
@@ -224,6 +236,7 @@ def __init__(
 
         super().__init__(lines=lines, artefacts=artefacts)
         self.geometry = geometry
+        self.objectness_score = objectness_score
 
     def render(self, line_break: str = "\n") -> str:
         """Renders the full text of the element"""