Skip to content

Commit

Permalink
Merge pull request molgenis#9352 from tommydeboer/feat/scrambler
Browse files Browse the repository at this point in the history
Auto id pattern scrambler
  • Loading branch information
jelmerveen authored Nov 25, 2021
2 parents d319250 + 58d25d3 commit f4b655c
Showing 7 changed files with 98 additions and 20 deletions.
19 changes: 19 additions & 0 deletions docs/guide-pseudonymisation.md
Original file line number Diff line number Diff line change
@@ -11,6 +11,19 @@ GEN-0000003
etc.
```

Besides incrementing sequentially, the digit part of the ids can also be scrambled:

```
GEN-5720385
GEN-1398822
GEN-9401776
etc.
```
These are not random: the generated ids are guaranteed to be unique and are based on the incrementing sequence.

> Note: Be aware that the scrambled identifiers _will_ repeat when all possibilities are exhausted. So
> make sure that the length of the digit-part is sufficient for your use case!
## How to configure
To configure an attribute as an incrementing identifier, first make sure the attribute has the following properties:
```
@@ -37,6 +50,12 @@ your EMX file:
|-----------|----------|----------|-------------|---------------------------------|
| pseudonym | string | false | AUTO | hasIDDigitCount7,hasIDPrefixGen |

If you want the identifiers to be scrambled, you should also add the `scrambled` tag to the attribute:

| name | dataType | nillable | idAttribute | tags |
|-----------|----------|----------|-------------|-------------------------------------------|
| pseudonym | string | false | AUTO | hasIDDigitCount7,hasIDPrefixGen,scrambled |

## Endpoints
To keep track of a sequence's current value, it is stored in the database. To interact
with a sequence there are two endpoints available:
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
import static org.molgenis.data.semantic.Relation.type;
import static org.molgenis.data.semantic.Vocabulary.AUDIT_USAGE;
import static org.molgenis.data.semantic.Vocabulary.CASE_SENSITIVE;
import static org.molgenis.data.semantic.Vocabulary.SCRAMBLED;

import java.util.List;
import org.molgenis.data.DataService;
@@ -43,8 +44,14 @@ public void populate() {
audited.setRelationIri(isAudited.getIRI());
audited.setRelationLabel(isAudited.getLabel());

Tag scrambled = tagFactory.create("scrambled");
scrambled.setLabel("Scrambled");
scrambled.setObjectIri(SCRAMBLED.toString());
scrambled.setRelationIri(type.getIRI());
scrambled.setRelationLabel(type.getLabel());

dataService
.getRepository(TAG, Tag.class)
.upsertBatch(List.of(isAToken, isCaseSensitive, audited));
.upsertBatch(List.of(isAToken, isCaseSensitive, audited, scrambled));
}
}
Original file line number Diff line number Diff line change
@@ -26,6 +26,7 @@ class TagPopulatorTest {
@Mock Tag token;
@Mock Tag caseSensitive;
@Mock Tag audited;
@Mock Tag scrambled;
@Mock Repository<Tag> tagRepository;

private TagPopulator tagPopulator;
@@ -40,12 +41,13 @@ public void testPopulate() {
when(tagFactory.create("token")).thenReturn(token);
when(tagFactory.create("case-sensitive")).thenReturn(caseSensitive);
when(tagFactory.create("audit-usage")).thenReturn(audited);
when(tagFactory.create("scrambled")).thenReturn(scrambled);

when(dataService.getRepository(TagMetadata.TAG, Tag.class)).thenReturn(tagRepository);

tagPopulator.populate();

verify(tagRepository).upsertBatch(List.of(token, caseSensitive, audited));
verify(tagRepository).upsertBatch(List.of(token, caseSensitive, audited, scrambled));
verify(token).setRelationIri(RDF.TYPE.toString());
verify(token).setObjectIri(XMLSchema.TOKEN.toString());

@@ -54,5 +56,8 @@ public void testPopulate() {

verify(audited).setRelationIri(isAudited.getIRI());
verify(audited).setObjectIri(Vocabulary.AUDIT_USAGE.toString());

verify(scrambled).setRelationIri(RDF.TYPE.toString());
verify(scrambled).setObjectIri(Vocabulary.SCRAMBLED.toString());
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.molgenis.data.populate;

import static com.google.common.collect.Iterables.tryFind;
import static com.google.common.collect.Streams.stream;
import static java.util.Collections.singletonList;
import static java.util.Objects.requireNonNull;
@@ -9,6 +8,8 @@
import static org.molgenis.data.meta.AttributeType.STRING;
import static org.molgenis.data.semantic.Relation.hasIDDigitCount;
import static org.molgenis.data.semantic.Relation.hasIDPrefix;
import static org.molgenis.data.semantic.Relation.type;
import static org.molgenis.data.semantic.Vocabulary.SCRAMBLED;

import java.text.DecimalFormat;
import java.time.Instant;
@@ -19,12 +20,14 @@
import org.molgenis.data.meta.AttributeType;
import org.molgenis.data.meta.model.Attribute;
import org.molgenis.data.meta.model.Tag;
import org.molgenis.util.IntScrambler;
import org.molgenis.util.UnexpectedEnumException;
import org.springframework.stereotype.Component;

/** Populate entity values for auto attributes */
@Component
public class AutoValuePopulator {

private final IdGenerator idGenerator;
private final Sequences sequences;

@@ -54,30 +57,50 @@ public void populate(Entity entity) {
}

/**
* Generates a new sequence ID if the attribute is tagged with ID prefix and ID digit count
* Generates a new sequence ID if the attribute is tagged with ID prefix and ID digit count. If
* the ID is also tagged with "scrambled", it will scramble the digit part.
*
* @param attribute the ID attribute
* @return the formatted ID built from the next sequence value, or empty if the attribute lacks an ID prefix tag or an ID digit count tag
*/
private Optional<String> generateFormattedSequenceId(Attribute attribute) {
return Optional.ofNullable(attribute.getTags())
return stream(attribute.getTags())
.filter(this::isIdPrefix)
.findFirst()
.map(Tag::getValue)
.flatMap(
tags ->
tryFind(tags, tag -> hasIDPrefix.getIRI().equals(tag.getRelationIri()))
.toJavaUtil()
idPrefix ->
stream(attribute.getTags())
.filter(this::isIdDigitCount)
.findFirst()
.map(Tag::getValue)
.flatMap(
idPrefix ->
tryFind(
tags,
tag -> hasIDDigitCount.getIRI().equals(tag.getRelationIri()))
.toJavaUtil()
.map(Tag::getValue)
.map(Integer::parseInt)
.map("0"::repeat)
.map(zeroes -> idPrefix + zeroes))
.map(DecimalFormat::new)
.map(format -> format.format((int) sequences.generateId(attribute))));
.map(Integer::parseInt)
.map("0"::repeat)
.map(zeroes -> idPrefix + zeroes))
.map(DecimalFormat::new)
.map(
format -> {
int sequence = (int) sequences.generateId(attribute);
if (stream(attribute.getTags()).anyMatch(this::isScrambled)) {
var scrambler = IntScrambler.forDecimalFormat(format);
return format.format(scrambler.scramble(sequence));
} else {
return format.format(sequence);
}
});
}

/**
 * Tests whether a tag is the "scrambled" marker tag.
 *
 * @param tag the tag to inspect
 * @return true if the tag's relation IRI is the {@code type} relation and its object IRI is
 *     {@code SCRAMBLED}
 */
private boolean isScrambled(Tag tag) {
  // Check the relation first; the object IRI is only read when the relation matches.
  if (!type.getIRI().equals(tag.getRelationIri())) {
    return false;
  }
  return SCRAMBLED.toString().equals(tag.getObjectIri());
}

/**
 * Tests whether a tag uses the ID prefix relation.
 *
 * @param tag the tag to inspect
 * @return true if the tag's relation IRI equals the {@code hasIDPrefix} relation IRI
 */
private boolean isIdPrefix(Tag tag) {
  final String prefixRelationIri = hasIDPrefix.getIRI();
  return prefixRelationIri.equals(tag.getRelationIri());
}

/**
 * Tests whether a tag uses the ID digit count relation.
 *
 * @param tag the tag to inspect
 * @return true if the tag's relation IRI equals the {@code hasIDDigitCount} relation IRI
 */
private boolean isIdDigitCount(Tag tag) {
  final String digitCountRelationIri = hasIDDigitCount.getIRI();
  return digitCountRelationIri.equals(tag.getRelationIri());
}

private static void generateAutoDateOrDateTime(
Original file line number Diff line number Diff line change
@@ -10,6 +10,9 @@ private Vocabulary() {}
public static final IRI CASE_SENSITIVE =
SimpleValueFactory.getInstance().createIRI("http://purl.obolibrary.org/obo/NCIT_C71490");

/** Object IRI of the "scrambled" tag; auto ID attributes tagged with it get a scrambled digit part. */
public static final IRI SCRAMBLED =
SimpleValueFactory.getInstance().createIRI("http://molgenis.org#scrambled");

public static final IRI AUDIT_USAGE =
SimpleValueFactory.getInstance().createIRI("http://molgenis.org/audit#usage");
}
Original file line number Diff line number Diff line change
@@ -12,6 +12,8 @@
import static org.molgenis.data.meta.AttributeType.STRING;
import static org.molgenis.data.semantic.Relation.hasIDDigitCount;
import static org.molgenis.data.semantic.Relation.hasIDPrefix;
import static org.molgenis.data.semantic.Relation.type;
import static org.molgenis.data.semantic.Vocabulary.SCRAMBLED;

import java.util.List;
import org.junit.jupiter.api.BeforeEach;
@@ -44,6 +46,7 @@ class AutoValuePopulatorTest {
@Mock private IdGenerator idGenerator;
@Mock private Tag idDigitCountTag;
@Mock private Tag idPrefixTag;
@Mock private Tag scrambledTag;

@BeforeEach
void setUpBeforeMethod() {
@@ -133,4 +136,21 @@ void populateStringFromSequence() {

assertEquals("GEN-0000123", entity.getString(attrId));
}

@Test
void populateStringFromSequenceScrambled() {
  // Prefix tag: supplies the literal text in front of the digits.
  when(idPrefixTag.getRelationIri()).thenReturn(hasIDPrefix.getIRI());
  when(idPrefixTag.getValue()).thenReturn("GEN-");

  // Digit count tag: supplies the number of digits in the ID.
  when(idDigitCountTag.getRelationIri()).thenReturn(hasIDDigitCount.getIRI());
  when(idDigitCountTag.getValue()).thenReturn("7");

  // Scrambled tag: identified by the type relation plus the SCRAMBLED object IRI.
  when(scrambledTag.getRelationIri()).thenReturn(type.getIRI());
  when(scrambledTag.getObjectIri()).thenReturn(SCRAMBLED.toString());

  // The ID attribute carries all three tags and the sequence yields 1.
  when(attrId.getTags()).thenReturn(List.of(idDigitCountTag, idPrefixTag, scrambledTag));
  when(sequences.generateId(attrId)).thenReturn(1L);

  Entity populated = new DynamicEntity(entityType);
  autoValuePopulator.populate(populated);

  // The digit part is the scrambled form of sequence value 1, not "0000001".
  assertEquals("GEN-5184445", populated.getString(attrId));
}
}
Original file line number Diff line number Diff line change
@@ -41,6 +41,7 @@ public class IntScrambler {
a = aCandidate;
}

@SuppressWarnings("ResultOfMethodCallIgnored")
public static IntScrambler forDecimalFormat(DecimalFormat decimalFormat) {
var matcher = Pattern.compile("0+").matcher(decimalFormat.toPattern());
matcher.find();

0 comments on commit f4b655c

Please sign in to comment.