Add entry point for extracting datasets from TEI #4

Draft · wants to merge 47 commits into base: master
Changes shown from 1 commit (of 47 commits):
5e8ecfd
make gradle build and add github actions
lfoppiano Mar 29, 2024
d545b5d
read grobid-home from configuration
lfoppiano Mar 29, 2024
33648de
disable superfluous tests
lfoppiano Mar 29, 2024
49a07b6
fix build
lfoppiano Mar 29, 2024
5d2872e
add simple test on analyzer to get started
lfoppiano Mar 29, 2024
8bc2987
enable jacoco report
lfoppiano Mar 29, 2024
fd84d88
fix build docker
lfoppiano Mar 29, 2024
ffb5bea
disable docker build for the moment
lfoppiano Mar 29, 2024
bb48f37
add parameter to enable/disable sentence segmentation for TEI processing
lfoppiano Apr 18, 2024
f05f68b
Update docker build (#1)
lfoppiano Apr 26, 2024
981ac95
implement tei processing for datasets
lfoppiano Apr 26, 2024
d668625
fix output JSON streaming
lfoppiano Apr 26, 2024
33d4f13
Merge branch 'master' into add-tei-processing-dataset
lfoppiano May 1, 2024
288850f
add the rest of the processing
lfoppiano May 2, 2024
12dcc37
disable broken tests
lfoppiano May 2, 2024
23c2dd5
add XML JATS entry point
lfoppiano May 2, 2024
0213c78
add CC-BY sample documents
lfoppiano May 2, 2024
52ffc23
revert to the original port
lfoppiano May 2, 2024
4448437
enable TEI processing in UI - javascript joy
lfoppiano May 2, 2024
4aad23d
correct parameter
lfoppiano May 2, 2024
6989335
attach URLs obtained from Grobid's TEI
lfoppiano May 6, 2024
7f0cdd5
fix frontend
lfoppiano May 7, 2024
1c5ff72
fix github action
lfoppiano May 7, 2024
4cd7390
fix wrong ifs - thanks intellij!
lfoppiano May 9, 2024
df86b81
avoid exception when entities are empty
lfoppiano May 9, 2024
843463c
avoid injecting null stuff
lfoppiano May 9, 2024
1b1da5f
reduce the timeout for checking the disambiguation service
lfoppiano May 12, 2024
75dd711
fix the convention for sentence segmentation and enable it
lfoppiano May 20, 2024
758f418
update examples
lfoppiano May 21, 2024
91fe70d
add sequence (sentence, paragraph) identifier in each mention
lfoppiano May 21, 2024
cc1cd2a
Fix sentence switch
lfoppiano May 21, 2024
c58502e
Fix incorrect xpath on children
lfoppiano May 23, 2024
6977bda
Cleanup text when extracting from XML, normalise unicode character, r…
lfoppiano Jun 4, 2024
cc01140
Fix bug in the xpaths that were used wrongly to select sentences or p…
lfoppiano Jun 4, 2024
3c3af44
Try to get possible sections in the <back> in which the das is hidden…
lfoppiano Jun 4, 2024
7b6fe06
update to grobid 0.8.1, and catch up other changes
lfoppiano Sep 14, 2024
2162720
retrieve URLs from the TEI XML in all the sections that are of interest
lfoppiano Oct 13, 2024
a2b5bbb
update github actions
lfoppiano Oct 13, 2024
e3a4890
fix xpath to fall back into div into TEI/back
lfoppiano Oct 13, 2024
371f520
cleanup
lfoppiano Oct 13, 2024
1483aab
fix reference mapping
lfoppiano Oct 13, 2024
4ab67a6
fix references extraction
lfoppiano Oct 14, 2024
774dd78
fix regression
lfoppiano Oct 22, 2024
b18454b
cosmetics
lfoppiano Oct 22, 2024
962f7eb
fix regressions in the way we attach references from TEI
lfoppiano Oct 22, 2024
3b343c6
allow xml:id to be string using a wrapper that generates integer to m…
lfoppiano Jan 1, 2025
f58c493
fix extraction of urls that are not well formed (supplementary-materi…
lfoppiano Jan 2, 2025
allow xml:id to be string using a wrapper that generates integer to maintain the compatibility with the rest of the processing
lfoppiano committed Jan 1, 2025
commit 3b343c6fc9867a65df7ca19a2433961b154cc390
17 changes: 16 additions & 1 deletion Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,22 @@ curl --form input=@./src/test/resources/PMC1636350.pdf --form disambiguate=1 loc

For PDF, each entity will be associated with a list of bounding box coordinates relative to the PDF, see [here](https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/#coordinate-system-in-the-pdf) for more explanation about the coordinate system.

In addition, the response will contain the bibliographical reference information associated to a dataset mention when found. The bibliographical information are provided in XML TEI (similar format as GROBID).
In addition, the response will contain the bibliographical reference information associated to a dataset mention when found.
The bibliographical information are provided in XML TEI (similar format as GROBID).

#### /service/annotateDatasetTEI

This entry point consumes a TEI-XML file produced by Grobid or pub2tei.

| method | request type | response type | parameters | requirement | description |
|--------|-----------------------|--------------------|--------------------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|
| POST | `multipart/form-data` | `application/json` | `input` | required | TEI file to be processed |
| | | | `segmentSentences` | optional | Indicate whether to apply sentence segmentation. If the TEI was segmented before (by Grobid, for example) this should be set to '0'. |

[//]: # (| | | | `disambiguate` | optional | `disambiguate` is a string of value `0` &#40;no disambiguation, default value&#41; or `1` &#40;disambiguate and inject Wikidata entity id and Wikipedia pageId&#41; |)


Using ```curl``` POST request with a __TEI-XML file__:
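The curl example is left blank at this point in the diff; a minimal sketch of what such a request could look like, mirroring the PDF example earlier in this README. The host, port (8060), and input file path are assumptions, not values confirmed by this PR — adjust them to your deployment:

```shell
# Sketch: POST a TEI-XML file to the annotateDatasetTEI endpoint.
# localhost:8060 and the sample path are placeholders -- adapt as needed.
curl --form input=@./src/test/resources/sample.tei.xml \
     --form segmentSentences=1 \
     localhost:8060/service/annotateDatasetTEI
```

Set `segmentSentences=0` if the input TEI was already sentence-segmented by Grobid.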


## Contact and License
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/grobid/core/data/BiblioComponent.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public class BiblioComponent extends DatasetComponent {
// the full matched bibliographical reference record
protected BiblioItem biblio = null;

// identifier for relating callout and reference, should be cconsistent with
// identifier for relating callout and reference, should be consistent with
// a full text TEI produced by GROBID
protected int refKey = -1;

Expand Down
49 changes: 49 additions & 0 deletions src/main/java/org/grobid/core/data/BiblioComponentWrapper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package org.grobid.core.data;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Maps string-valued reference keys (e.g. xml:id values) to generated integer
 * keys and back, so string identifiers stay compatible with processing that
 * expects integer refKeys.
 */
public class BiblioComponentWrapper {
private Map<String, Integer> stringToRefKeyMap;
private Map<Integer, String> refKeyToStringMap;
private AtomicInteger refKeyGenerator;

public BiblioComponentWrapper() {
stringToRefKeyMap = new HashMap<>();
refKeyToStringMap = new HashMap<>();
refKeyGenerator = new AtomicInteger(0);
}

public void addMapping(String refKeyString) {
if (!stringToRefKeyMap.containsKey(refKeyString)) {
int refKey = refKeyGenerator.incrementAndGet();
stringToRefKeyMap.put(refKeyString, refKey);
refKeyToStringMap.put(refKey, refKeyString);
}
}

public Integer getRefKey(String refKeyString) {
String refKeyStringClean = refKeyString.replaceFirst("^#", "");
addMapping(refKeyStringClean);
return stringToRefKeyMap.get(refKeyStringClean);
}

public String getRefKeyString(int refKey) {
return refKeyToStringMap.get(refKey);
}

public void removeMapping(String refKeyString) {
Integer refKey = stringToRefKeyMap.remove(refKeyString);
if (refKey != null) {
refKeyToStringMap.remove(refKey);
}
}

public void removeMapping(int refKey) {
String refKeyString = refKeyToStringMap.remove(refKey);
if (refKeyString != null) {
stringToRefKeyMap.remove(refKeyString);
}
}
}
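To illustrate the round-trip this wrapper provides, here is a self-contained sketch. The `Wrapper` class below is a condensed re-statement of the diff above (using `computeIfAbsent` instead of an explicit `containsKey` check), and the demo class and sample ids are purely illustrative:

```java
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

// Condensed re-statement of BiblioComponentWrapper from the diff above.
class Wrapper {
    private final Map<String, Integer> stringToRefKeyMap = new HashMap<>();
    private final Map<Integer, String> refKeyToStringMap = new HashMap<>();
    private final AtomicInteger refKeyGenerator = new AtomicInteger(0);

    public Integer getRefKey(String refKeyString) {
        // Strip a leading '#' (TEI callouts reference targets as "#id").
        String clean = refKeyString.replaceFirst("^#", "");
        return stringToRefKeyMap.computeIfAbsent(clean, k -> {
            int key = refKeyGenerator.incrementAndGet();
            refKeyToStringMap.put(key, k);
            return key;
        });
    }

    public String getRefKeyString(int refKey) {
        return refKeyToStringMap.get(refKey);
    }
}

public class WrapperDemo {
    public static void main(String[] args) {
        Wrapper w = new Wrapper();
        int k1 = w.getRefKey("#b12");    // leading '#' stripped before mapping
        int k2 = w.getRefKey("b12");     // same id with or without '#' -> same key
        int k3 = w.getRefKey("ref-abc"); // non-numeric xml:id still gets an int key
        if (k1 != k2) throw new AssertionError("same id must map to the same key");
        if (k1 == k3) throw new AssertionError("distinct ids must get distinct keys");
        if (!"b12".equals(w.getRefKeyString(k1))) throw new AssertionError("round-trip failed");
        System.out.println("b12 -> " + k1 + ", ref-abc -> " + k3); // prints: b12 -> 1, ref-abc -> 2
    }
}
```

This is what lets the commit "allow xml:id to be string" work: downstream code keeps consuming integer refKeys while the TEI may carry arbitrary string ids.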
32 changes: 24 additions & 8 deletions src/main/java/org/grobid/core/engines/DatasetParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -962,7 +962,7 @@ public Pair<List<List<Dataset>>, Document> processPDF(File file,
TaggingLabel clusterLabel = cluster.getTaggingLabel();

List<LayoutToken> localTokenization = cluster.concatTokens();
if ((localTokenization == null) || (localTokenization.size() == 0))
if (CollectionUtils.isEmpty(localTokenization))
continue;

if (clusterLabel.equals(TaggingLabels.CITATION_MARKER)) {
Expand Down Expand Up @@ -1937,7 +1937,10 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
}

// Read and parse references
Map<String, Pair<String, org.w3c.dom.Node>> referenceMap = new HashMap<>();

BiblioComponentWrapper biblioComponentWrapper = new BiblioComponentWrapper();

Map<Integer, Pair<String, org.w3c.dom.Node>> referenceMap = new HashMap<>();
try {
String expression = "//*[local-name() = 'div'][@*[local-name()='type' and .='references']]/*[local-name() = 'listBibl']/*[local-name() = 'biblStruct']";
org.w3c.dom.NodeList bodyNodeList = (org.w3c.dom.NodeList) xPath.evaluate(expression,
Expand All @@ -1953,7 +1956,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
String referenceText = item.getTextContent();
String normalizedReferenceText = normalize(referenceText);
String cleanedRawReferenceText = normalizedReferenceText.replaceAll("\\p{Space}+", " ").strip().replaceAll("[ ]{2,}", ", ");
referenceMap.put(attribute.getNodeValue(), Pair.of(cleanedRawReferenceText, item));
referenceMap.put(biblioComponentWrapper.getRefKey(attribute.getNodeValue()), Pair.of(cleanedRawReferenceText, item));
}
}
}
Expand All @@ -1974,6 +1977,18 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)))
.collect(Collectors.toList());

// List<Map<String, Triple<OffsetPosition, String, String>>> referencesInSequences = selectedSequences.stream()
// .map(sequence -> sequence.getReferences().entrySet().stream()
// .filter(entry -> BIBLIO_CALLOUT_TYPE.equals(entry.getValue().getRight()))
// .collect(
// Collectors.toMap(
// entry -> String.valueOf(biblioComponentWrapper.getRefKey(entry.getValue().getMiddle())),
// Map.Entry::getValue
// )
// )
// )
// .collect(Collectors.toList());

// List<Map<String, Triple<OffsetPosition, String, String>>> referencesList = selectedSequences.stream()
// .map(DatasetDocumentSequence::getReferences)
// .filter(map -> map.values().stream()
Expand All @@ -1990,15 +2005,16 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
String target = infos.getMiddle();
OffsetPosition position = infos.getLeft();

Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(target);
Pair<String, org.w3c.dom.Node> referenceInformation = referenceMap.get(biblioComponentWrapper.getRefKey(target));
if (referenceInformation != null) {
BiblioItem biblioItem = XMLUtilities.parseTEIBiblioItem((org.w3c.dom.Element) referenceInformation.getRight());
String refTextClean = refText.replaceAll("[\\[\\], ]+", "");

biblioRefMap.put(refTextClean, biblioItem);
BiblioComponent biblioComponent = new BiblioComponent(
biblioItem, Integer.parseInt(target.replace("b", ""))
);

Integer refKey = biblioComponentWrapper.getRefKey(target);

BiblioComponent biblioComponent = new BiblioComponent(biblioItem, refKey);
biblioComponent.setRawForm(refText);
biblioComponent.setOffsetStart(position.start);
biblioComponent.setOffsetEnd(position.end);
Expand Down Expand Up @@ -2238,7 +2254,7 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
return Pair.of(entities, citationsToConsolidate);
}

private static String normalize(String text) {
public static String normalize(String text) {
String normalizedText = UnicodeUtil.normaliseText(text);
normalizedText = normalizedText.replace("\n", " ");
normalizedText = normalizedText.replace("\t", " ");
Expand Down
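The reference map built in `processTEIDocument` hinges on the namespace-agnostic XPath shown in the hunk above. A self-contained sketch of that selection step — the helper class, sample TEI snippet, and whitespace normalisation are illustrative, not the project's actual code:

```java
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public class ReferenceMapDemo {
    // Namespace-agnostic XPath from the diff: selects <biblStruct> entries
    // under <div type="references">/<listBibl>, whatever the TEI namespace prefix.
    static final String XP =
        "//*[local-name() = 'div'][@*[local-name()='type' and .='references']]"
      + "/*[local-name() = 'listBibl']/*[local-name() = 'biblStruct']";

    public static Map<String, String> buildReferenceMap(String tei) throws Exception {
        DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();
        f.setNamespaceAware(true);
        Document doc = f.newDocumentBuilder().parse(new InputSource(new StringReader(tei)));
        NodeList refs = (NodeList) XPathFactory.newInstance().newXPath()
                .evaluate(XP, doc, XPathConstants.NODESET);
        Map<String, String> map = new HashMap<>();
        for (int i = 0; i < refs.getLength(); i++) {
            Node item = refs.item(i);
            Node id = item.getAttributes().getNamedItem("xml:id");
            if (id != null) {
                // Collapse whitespace, as the diff does before storing the raw reference text.
                map.put(id.getNodeValue(),
                        item.getTextContent().replaceAll("\\p{Space}+", " ").strip());
            }
        }
        return map;
    }

    public static void main(String[] args) throws Exception {
        String tei = "<TEI xmlns=\"http://www.tei-c.org/ns/1.0\"><text><back>"
                   + "<div type=\"references\"><listBibl>"
                   + "<biblStruct xml:id=\"b0\"><note>Smith 2020</note></biblStruct>"
                   + "</listBibl></div></back></text></TEI>";
        System.out.println(buildReferenceMap(tei));
    }
}
```

In the actual diff, the string key returned by `getNamedItem` is further passed through `biblioComponentWrapper.getRefKey(...)` so the map is keyed by integer.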
61 changes: 38 additions & 23 deletions src/main/java/org/grobid/core/utilities/XMLUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
import java.io.StringWriter;
import java.util.*;

import static org.grobid.core.engines.DatasetParser.normalize;

/**
* Some convenient methods for suffering a bit less with XML.
*/
Expand Down Expand Up @@ -82,7 +84,7 @@ public static String toPrettyString(String xml, int indent) {

public static Element getFirstDirectChild(Element parent, String name) {
for(Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
if (child instanceof Element && name.equals(child.getNodeName()))
if (child instanceof Element && name.equals(child.getNodeName()))
return (Element) child;
}
return null;
Expand All @@ -91,8 +93,8 @@ public static Element getFirstDirectChild(Element parent, String name) {
public static Element getLastDirectChild(Element parent, String name) {
NodeList children = parent.getChildNodes();
for(int j=children.getLength()-1; j>0; j--) {
Node child = children.item(j);
if (child instanceof Element && name.equals(child.getNodeName()))
Node child = children.item(j);
if (child instanceof Element && name.equals(child.getNodeName()))
return (Element) child;
}
return null;
Expand Down Expand Up @@ -123,7 +125,7 @@ public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Element biblStructElemen
} catch(Exception e) {
if (teiXML != null)
LOGGER.warn("The parsing of the biblStruct from TEI document failed for: " + teiXML);
else
else
LOGGER.warn("The parsing of the biblStruct from TEI document failed for: " + biblStructElement.toString());
}
return handler.getBiblioItem();
Expand All @@ -138,7 +140,7 @@ public static String getTextNoRefMarkers(Element element) {
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("ref".equals(node.getNodeName()))
continue;
}
}
if (node.getNodeType() == Node.TEXT_NODE) {
buf.append(node.getNodeValue());
found = true;
Expand All @@ -147,6 +149,19 @@ public static String getTextNoRefMarkers(Element element) {
return found ? buf.toString() : null;
}

public static String getTextRecursively(Node node) {
StringBuilder textContent = new StringBuilder();
NodeList children = node.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
Node child = children.item(i);
if (child.getNodeType() == Node.TEXT_NODE) {
textContent.append(child.getNodeValue());
} else if (child.getNodeType() == Node.ELEMENT_NODE) {
textContent.append(getTextRecursively(child));
}
}
return textContent.toString();
}
/**
* @return Pair with text or null on the left and a Triple with (position, target and type)
*/
Expand Down Expand Up @@ -181,16 +196,16 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
for (int j = 0; j < list2.getLength(); j++) {
Node subChildNode = list2.item(j);
if (subChildNode.getNodeType() == Node.TEXT_NODE) {
String chunk = subChildNode.getNodeValue();
String chunk = normalize(getTextRecursively(node));

if (BIBLIO_CALLOUT_TYPE.equals(((Element) node).getAttribute("type"))) {
Triple<OffsetPosition, String, String> refInfo = Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, BIBLIO_CALLOUT_TYPE);
right.put(chunk, refInfo);
right.put(StringUtils.strip(chunk), refInfo);
String holder = StringUtils.repeat(" ", chunk.length());
buf.append(holder);
} else if (URI_TYPE.equals(((Element) node).getAttribute("type")) || URL_TYPE.equals(((Element) node).getAttribute("type"))) {
org.apache.commons.lang3.tuple.Triple<OffsetPosition, String, String> urlInfo = org.apache.commons.lang3.tuple.Triple.of(new OffsetPosition(indexPos, indexPos+chunk.length()), target, URL_TYPE);
right.put(chunk, urlInfo);
right.put(StringUtils.strip(chunk), urlInfo);
// we still add it like normal text
buf.append(chunk);
found = true;
Expand Down Expand Up @@ -254,8 +269,8 @@ public static String serialize(org.w3c.dom.Document doc, Node node) {
XPathFactory xpathFactory = XPathFactory.newInstance();
// XPath to find empty text nodes.
XPathExpression xpathExp = xpathFactory.newXPath().compile(
"//text()[normalize-space(.) = '']");
NodeList emptyTextNodes = (NodeList)
"//text()[normalize-space(.) = '']");
NodeList emptyTextNodes = (NodeList)
xpathExp.evaluate(doc, XPathConstants.NODESET);

// Remove each empty text node from document.
Expand Down Expand Up @@ -368,7 +383,7 @@ public static void cleanXMLCorpus(String documentPath) throws Exception {
// Return pretty print xml string
StringWriter stringWriter = new StringWriter();
transformer.transform(new DOMSource(document), new StreamResult(stringWriter));

// write result to file
FileUtils.writeStringToFile(outputFile, stringWriter.toString(), "UTF-8");

Expand All @@ -386,7 +401,7 @@ public static void cleanXMLCorpus(String documentPath) throws Exception {

/**
* Return the document ID where the annotation is located
*/
*/
private static String getDocIdFromRs(org.w3c.dom.Node node) {
String result = null;
// first go up to the tei element root
Expand Down Expand Up @@ -423,11 +438,11 @@ private static String getDocIdFromRs(org.w3c.dom.Node node) {
}

public static String stripNonValidXMLCharacters(String in) {
StringBuffer out = new StringBuffer();
char current;
StringBuffer out = new StringBuffer();
char current;

if (in == null || ("".equals(in)))
return "";
if (in == null || ("".equals(in)))
return "";
for (int i = 0; i < in.length(); i++) {
current = in.charAt(i); // NOTE: No IndexOutOfBoundsException caught here; it should not happen.
if ((current == 0x9) ||
Expand All @@ -439,7 +454,7 @@ public static String stripNonValidXMLCharacters(String in) {
out.append(current);
}
return out.toString();
}
}

private static List<String> textualElements = Arrays.asList("p", "figDesc");

Expand All @@ -451,7 +466,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
final NodeList children = node.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
final Node n = children.item(i);
if ( (n.getNodeType() == Node.ELEMENT_NODE) &&
if ( (n.getNodeType() == Node.ELEMENT_NODE) &&
(textualElements.contains(n.getNodeName())) ) {
// text content
//String text = n.getTextContent();
Expand Down Expand Up @@ -492,7 +507,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(new StringReader(fullSent)));
org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(new StringReader(fullSent)));
} catch(Exception e) {
fail = true;
}
Expand All @@ -509,7 +524,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
//System.out.println("-----------------");
sent = sent.replace("\n", " ");
sent = sent.replaceAll("( )+", " ");

//Element sentenceElement = doc.createElement("s");
//sentenceElement.setTextContent(sent);
//newNodes.add(sentenceElement);
Expand Down Expand Up @@ -539,12 +554,12 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
if (n.getNodeName().equals("figDesc")) {
Element theDiv = doc.createElementNS("http://www.tei-c.org/ns/1.0", "div");
Element theP = doc.createElementNS("http://www.tei-c.org/ns/1.0", "p");
for(Node theNode : newNodes)
for(Node theNode : newNodes)
theP.appendChild(theNode);
theDiv.appendChild(theP);
n.appendChild(theDiv);
} else {
for(Node theNode : newNodes)
for(Node theNode : newNodes)
n.appendChild(theNode);
}

Expand All @@ -561,7 +576,7 @@ public static void segment(org.w3c.dom.Document doc, Node node) {
* @param args Command line arguments.
*/
public static void main(String[] args) {

// we are expecting one argument, absolute path to the TEICorpus document

if (args.length != 1) {
Expand Down