Merge pull request molgenis#9352 from tommydeboer/feat/scrambler

Auto id pattern scrambler
sidohaakma · Nov 25, 2021 · f4b655c · f4b655c
2 parents d319250 + 58d25d3
commit f4b655c
Showing 7 changed files with 98 additions and 20 deletions.
diff --git a/docs/guide-pseudonymisation.md b/docs/guide-pseudonymisation.md
@@ -11,6 +11,19 @@ GEN-0000003
 etc.
 ```
 
+Besides incrementing sequentially, the digit part of the ids can also be scrambled:
+
+```
+GEN-5720385
+GEN-1398822
+GEN-9401776
+etc.
+```
+These are not random: the generated ids are guaranteed to be unique and are based on the incrementing sequence.
+
+> Note: Be aware that the scrambled identifiers _will_ repeat when all possibilities are exhausted. So
+> make sure that the length of the digit-part is sufficient for your use case!
+
 ## How to configure
 To configure an attribute as an incrementing identifier, first make sure the attribute has the following properties:
 ```
@@ -37,6 +50,12 @@ your EMX file:
 |-----------|----------|----------|-------------|---------------------------------|
 | pseudonym | string   | false    | AUTO        | hasIDDigitCount7,hasIDPrefixGen |
 
+If you want the identifiers to be scrambled, you should also add the `scrambled` tag to the attribute:
+
+| name      | dataType | nillable | idAttribute | tags                                      |
+|-----------|----------|----------|-------------|-------------------------------------------|
+| pseudonym | string   | false    | AUTO        | hasIDDigitCount7,hasIDPrefixGen,scrambled |
+
 ## Endpoints
 To keep track of a sequence's current value, it is stored in the database. To interact
 with a sequence there are two endpoints available:

diff --git a/molgenis-bootstrap/src/main/java/org/molgenis/bootstrap/populate/TagPopulator.java b/molgenis-bootstrap/src/main/java/org/molgenis/bootstrap/populate/TagPopulator.java
@@ -6,6 +6,7 @@
 import static org.molgenis.data.semantic.Relation.type;
 import static org.molgenis.data.semantic.Vocabulary.AUDIT_USAGE;
 import static org.molgenis.data.semantic.Vocabulary.CASE_SENSITIVE;
+import static org.molgenis.data.semantic.Vocabulary.SCRAMBLED;
 
 import java.util.List;
 import org.molgenis.data.DataService;
@@ -43,8 +44,14 @@ public void populate() {
     audited.setRelationIri(isAudited.getIRI());
     audited.setRelationLabel(isAudited.getLabel());
 
+    Tag scrambled = tagFactory.create("scrambled");
+    scrambled.setLabel("Scrambled");
+    scrambled.setObjectIri(SCRAMBLED.toString());
+    scrambled.setRelationIri(type.getIRI());
+    scrambled.setRelationLabel(type.getLabel());
+
     dataService
         .getRepository(TAG, Tag.class)
-        .upsertBatch(List.of(isAToken, isCaseSensitive, audited));
+        .upsertBatch(List.of(isAToken, isCaseSensitive, audited, scrambled));
   }
 }
diff --git a/molgenis-bootstrap/src/test/java/org/molgenis/bootstrap/populate/TagPopulatorTest.java b/molgenis-bootstrap/src/test/java/org/molgenis/bootstrap/populate/TagPopulatorTest.java
@@ -26,6 +26,7 @@ class TagPopulatorTest {
   @Mock Tag token;
   @Mock Tag caseSensitive;
   @Mock Tag audited;
+  @Mock Tag scrambled;
   @Mock Repository<Tag> tagRepository;
 
   private TagPopulator tagPopulator;
@@ -40,12 +41,13 @@ public void testPopulate() {
     when(tagFactory.create("token")).thenReturn(token);
     when(tagFactory.create("case-sensitive")).thenReturn(caseSensitive);
     when(tagFactory.create("audit-usage")).thenReturn(audited);
+    when(tagFactory.create("scrambled")).thenReturn(scrambled);
 
     when(dataService.getRepository(TagMetadata.TAG, Tag.class)).thenReturn(tagRepository);
 
     tagPopulator.populate();
 
-    verify(tagRepository).upsertBatch(List.of(token, caseSensitive, audited));
+    verify(tagRepository).upsertBatch(List.of(token, caseSensitive, audited, scrambled));
     verify(token).setRelationIri(RDF.TYPE.toString());
     verify(token).setObjectIri(XMLSchema.TOKEN.toString());
 
@@ -54,5 +56,8 @@ public void testPopulate() {
 
     verify(audited).setRelationIri(isAudited.getIRI());
     verify(audited).setObjectIri(Vocabulary.AUDIT_USAGE.toString());
+
+    verify(scrambled).setRelationIri(RDF.TYPE.toString());
+    verify(scrambled).setObjectIri(Vocabulary.SCRAMBLED.toString());
   }
 }
diff --git a/molgenis-data/src/main/java/org/molgenis/data/populate/AutoValuePopulator.java b/molgenis-data/src/main/java/org/molgenis/data/populate/AutoValuePopulator.java
@@ -1,6 +1,5 @@
 package org.molgenis.data.populate;
 
-import static com.google.common.collect.Iterables.tryFind;
 import static com.google.common.collect.Streams.stream;
 import static java.util.Collections.singletonList;
 import static java.util.Objects.requireNonNull;
@@ -9,6 +8,8 @@
 import static org.molgenis.data.meta.AttributeType.STRING;
 import static org.molgenis.data.semantic.Relation.hasIDDigitCount;
 import static org.molgenis.data.semantic.Relation.hasIDPrefix;
+import static org.molgenis.data.semantic.Relation.type;
+import static org.molgenis.data.semantic.Vocabulary.SCRAMBLED;
 
 import java.text.DecimalFormat;
 import java.time.Instant;
@@ -19,12 +20,14 @@
 import org.molgenis.data.meta.AttributeType;
 import org.molgenis.data.meta.model.Attribute;
 import org.molgenis.data.meta.model.Tag;
+import org.molgenis.util.IntScrambler;
 import org.molgenis.util.UnexpectedEnumException;
 import org.springframework.stereotype.Component;
 
 /** Populate entity values for auto attributes */
 @Component
 public class AutoValuePopulator {
+
   private final IdGenerator idGenerator;
   private final Sequences sequences;
 
@@ -54,30 +57,50 @@ public void populate(Entity entity) {
   }
 
   /**
-   * Generates a new sequence ID if the attribute is tagged with ID prefix and ID digit count
+   * Generates a new sequence ID if the attribute is tagged with ID prefix and ID digit count. If
+   * the ID is also tagged with "scrambled", it will scramble the digit part.
    *
    * @param attribute the ID attribute
    * @return formatted ID, in sequence with
    */
   private Optional<String> generateFormattedSequenceId(Attribute attribute) {
-    return Optional.ofNullable(attribute.getTags())
+    return stream(attribute.getTags())
+        .filter(this::isIdPrefix)
+        .findFirst()
+        .map(Tag::getValue)
         .flatMap(
-            tags ->
-                tryFind(tags, tag -> hasIDPrefix.getIRI().equals(tag.getRelationIri()))
-                    .toJavaUtil()
+            idPrefix ->
+                stream(attribute.getTags())
+                    .filter(this::isIdDigitCount)
+                    .findFirst()
                     .map(Tag::getValue)
-                    .flatMap(
-                        idPrefix ->
-                            tryFind(
-                                    tags,
-                                    tag -> hasIDDigitCount.getIRI().equals(tag.getRelationIri()))
-                                .toJavaUtil()
-                                .map(Tag::getValue)
-                                .map(Integer::parseInt)
-                                .map("0"::repeat)
-                                .map(zeroes -> idPrefix + zeroes))
-                    .map(DecimalFormat::new)
-                    .map(format -> format.format((int) sequences.generateId(attribute))));
+                    .map(Integer::parseInt)
+                    .map("0"::repeat)
+                    .map(zeroes -> idPrefix + zeroes))
+        .map(DecimalFormat::new)
+        .map(
+            format -> {
+              int sequence = (int) sequences.generateId(attribute);
+              if (stream(attribute.getTags()).anyMatch(this::isScrambled)) {
+                var scrambler = IntScrambler.forDecimalFormat(format);
+                return format.format(scrambler.scramble(sequence));
+              } else {
+                return format.format(sequence);
+              }
+            });
+  }
+
+  private boolean isScrambled(Tag tag) {
+    return type.getIRI().equals(tag.getRelationIri())
+        && SCRAMBLED.toString().equals(tag.getObjectIri());
+  }
+
+  private boolean isIdPrefix(Tag tag) {
+    return hasIDPrefix.getIRI().equals(tag.getRelationIri());
+  }
+
+  private boolean isIdDigitCount(Tag tag) {
+    return hasIDDigitCount.getIRI().equals(tag.getRelationIri());
   }
 
   private static void generateAutoDateOrDateTime(

diff --git a/molgenis-data/src/main/java/org/molgenis/data/semantic/Vocabulary.java b/molgenis-data/src/main/java/org/molgenis/data/semantic/Vocabulary.java
@@ -10,6 +10,9 @@ private Vocabulary() {}
   public static final IRI CASE_SENSITIVE =
       SimpleValueFactory.getInstance().createIRI("http://purl.obolibrary.org/obo/NCIT_C71490");
 
+  public static final IRI SCRAMBLED =
+      SimpleValueFactory.getInstance().createIRI("http://molgenis.org#scrambled");
+
   public static final IRI AUDIT_USAGE =
       SimpleValueFactory.getInstance().createIRI("http://molgenis.org/audit#usage");
 }
diff --git a/molgenis-data/src/test/java/org/molgenis/data/populate/AutoValuePopulatorTest.java b/molgenis-data/src/test/java/org/molgenis/data/populate/AutoValuePopulatorTest.java
@@ -12,6 +12,8 @@
 import static org.molgenis.data.meta.AttributeType.STRING;
 import static org.molgenis.data.semantic.Relation.hasIDDigitCount;
 import static org.molgenis.data.semantic.Relation.hasIDPrefix;
+import static org.molgenis.data.semantic.Relation.type;
+import static org.molgenis.data.semantic.Vocabulary.SCRAMBLED;
 
 import java.util.List;
 import org.junit.jupiter.api.BeforeEach;
@@ -44,6 +46,7 @@ class AutoValuePopulatorTest {
   @Mock private IdGenerator idGenerator;
   @Mock private Tag idDigitCountTag;
   @Mock private Tag idPrefixTag;
+  @Mock private Tag scrambledTag;
 
   @BeforeEach
   void setUpBeforeMethod() {
@@ -133,4 +136,21 @@ void populateStringFromSequence() {
 
     assertEquals("GEN-0000123", entity.getString(attrId));
   }
+
+  @Test
+  void populateStringFromSequenceScrambled() {
+    when(attrId.getTags()).thenReturn(List.of(idDigitCountTag, idPrefixTag, scrambledTag));
+    when(idDigitCountTag.getRelationIri()).thenReturn(hasIDDigitCount.getIRI());
+    when(idDigitCountTag.getValue()).thenReturn("7");
+    when(idPrefixTag.getRelationIri()).thenReturn(hasIDPrefix.getIRI());
+    when(idPrefixTag.getValue()).thenReturn("GEN-");
+    when(scrambledTag.getObjectIri()).thenReturn(SCRAMBLED.toString());
+    when(scrambledTag.getRelationIri()).thenReturn(type.getIRI());
+    when(sequences.generateId(attrId)).thenReturn(1L);
+
+    Entity entity = new DynamicEntity(entityType);
+    autoValuePopulator.populate(entity);
+
+    assertEquals("GEN-5184445", entity.getString(attrId));
+  }
 }
diff --git a/molgenis-util/src/main/java/org/molgenis/util/IntScrambler.java b/molgenis-util/src/main/java/org/molgenis/util/IntScrambler.java
@@ -41,6 +41,7 @@ public class IntScrambler {
     a = aCandidate;
   }
 
+  @SuppressWarnings("ResultOfMethodCallIgnored")
   public static IntScrambler forDecimalFormat(DecimalFormat decimalFormat) {
     var matcher = Pattern.compile("0+").matcher(decimalFormat.toPattern());
     matcher.find();