Skip to content

Commit

Permalink
feat(unit-number-extractor): mapper stream to separate concatenated u…
Browse files Browse the repository at this point in the history
…nit numbers (#502)

* feat(refactor): update documentMapper to allow easier extensions via external mappers

* feat(unit-number-extractor): add new mapper stream to separate concatenated unit numbers
  • Loading branch information
missinglink authored Feb 7, 2022
1 parent 5095f9d commit f3331af
Show file tree
Hide file tree
Showing 12 changed files with 792 additions and 65 deletions.
34 changes: 18 additions & 16 deletions lib/streams/documentStream.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
const through = require( 'through2' );
const peliasModel = require( 'pelias-model' );

// examples: GAACT718519668, GASA_424005553
const GNAF_PID_PATTERN = /^(GA)(NSW|VIC|QLD|SA_|WA_|TAS|NT_|ACT|OT_)([0-9]{9})$/;
// pattern to match a two character country code from the directory prefix
const COUNTRY_CODE_PATTERN = /^([A-Za-z]{2})\//;

/*
* Create a stream of Documents from valid, cleaned CSV records
Expand All @@ -22,26 +22,28 @@ function createDocumentStream(id_prefix, stats) {
uid++;

try {
const addrDoc = new peliasModel.Document( 'openaddresses', 'address', model_id )
.setName( 'default', (record.NUMBER + ' ' + record.STREET) )
.setCentroid( { lon: record.LON, lat: record.LAT } );

addrDoc.setAddress( 'number', record.NUMBER );

addrDoc.setAddress( 'street', record.STREET );
const doc = new peliasModel.Document('openaddresses', 'address', model_id)
.setName('default', `${record.NUMBER} ${record.STREET}`)
.setAddress('number', record.NUMBER)
.setAddress('street', record.STREET)
.setCentroid({ lon: record.LON, lat: record.LAT });

if (record.POSTCODE) {
addrDoc.setAddress( 'zip', record.POSTCODE );
doc.setAddress('zip', record.POSTCODE);
}

// detect Australian G-NAF PID concordances
if (id_prefix.startsWith('au/')) {
if (record.ID.length === 14 && record.ID.match(GNAF_PID_PATTERN)) {
addrDoc.setAddendum('concordances', {'gnaf:pid': record.ID});
}
// attempt to set the country code based on the directory prefix
const match = id_prefix.match(COUNTRY_CODE_PATTERN);
if (match && match[1]) {
doc.setMeta('country_code', match[1].toUpperCase());
}

this.push( addrDoc );
// store a reference to the original OA record in a 'meta'
// field, this is available through the pipeline but is not
// saved to elasticsearch.
doc.setMeta('oa', record);

this.push(doc);
}
catch ( ex ){
stats.badRecordCount++;
Expand Down
34 changes: 34 additions & 0 deletions lib/streams/gnafMapperStream.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/**
The GNAF mapper is responsible for extracting Australian GNAF
identifiers from the OA 'ID' property, where available.
**/

const _ = require('lodash');
const through = require('through2');
const logger = require('pelias-logger').get('openaddresses');

// examples: GAACT718519668, GASA_424005553
const GNAF_PID_PATTERN = /^(GA)(NSW|VIC|QLD|SA_|WA_|TAS|NT_|ACT|OT_)([0-9]{9})$/;

module.exports = function () {
return through.obj((doc, enc, next) => {
try {
if (doc.getMeta('country_code') === 'AU') {

// detect Australian G-NAF PID concordances
const oaid = _.get(doc.getMeta('oa'), 'ID');
if (oaid.length === 14 && oaid.match(GNAF_PID_PATTERN)) {
doc.setAddendum('concordances', { 'gnaf:pid': oaid });
}
}
}

catch (e) {
logger.error('gnaf_mapper error');
logger.error(e.stack);
logger.error(JSON.stringify(doc, null, 2));
}

return next(null, doc);
});
};
6 changes: 5 additions & 1 deletion lib/streams/recordStream.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ const CleanupStream = require('./cleanupStream');
const ContentHashStream = require('./contentHashStream');
const ValidRecordFilterStream = require('./validRecordFilterStream');
const DocumentStream = require('./documentStream');
const gnafMapperStreamFactory = require('./gnafMapperStream');
const unitSplittingMapperStreamFactory = require('./unitSplittingMapperStream');

/*
* Construct a suitable id prefix for a CSV file given
Expand Down Expand Up @@ -63,7 +65,9 @@ function createRecordStream( filePath, dirPath ){
.pipe( contentHashStream )
.pipe( validRecordFilterStream )
.pipe( cleanupStream )
.pipe( documentStream );
.pipe( documentStream )
.pipe( gnafMapperStreamFactory() )
.pipe( unitSplittingMapperStreamFactory() );
}

function geojsonStream(stream) {
Expand Down
73 changes: 73 additions & 0 deletions lib/streams/unitSplittingMapperStream.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
The unit splitting mapper is responsible for detecting when the address.number
field contains the concatenation of the unit and the housenumber.
eg. Flat 2 14 Smith St
In this case we attempt to split the two terms into their constituent parts.
note: Addressing formats vary between countries, it's unlikely that a pattern
which works for one country will also work internationally. For this reason this
mapper accepts a country code which can be used to select the appropriate pattern(s).
Feel free to make changes to this mapping file!
**/

const _ = require('lodash');
const through = require('through2');
const logger = require('pelias-logger').get('openaddresses');
const mappers = {};

// Australasian Unit Number Mapper
// https://auspost.com.au/content/dam/auspost_corp/media/documents/Appendix-01.pdf
// https://www.nzpost.co.nz/sites/nz/files/2021-10/adv358-address-standards.pdf
//
// Inspects the document's address 'number' and, when it matches one of the
// known "unit + housenumber" layouts, writes the two parts back to the
// 'unit' and 'number' address fields. Documents which match no layout are
// left untouched. Nothing is returned.
const australasian = (doc) => {
  const housenumber = doc.getAddress('number');

  // anything shorter than three characters cannot hold two terms plus a delimiter
  if (!_.isString(housenumber) || housenumber.length < 3) { return; }

  // each rule names the capture groups holding the unit and the housenumber;
  // rules are tried in order and the first match wins
  const rules = [
    // 2/14
    { pattern: /^(\d+)\s*\/\s*(\d+)$/, unitGroup: 1, numberGroup: 2 },
    // Flat 2 14 | F 2 14 | Unit 2 14 | APT 2 14
    { pattern: /^(flat|f|unit|apartment|apt)\s*(\d+)\s+(\d+)$/i, unitGroup: 2, numberGroup: 3 }
  ];

  for (const rule of rules) {
    const groups = housenumber.match(rule.pattern);
    if (groups) {
      doc.setAddress('unit', groups[rule.unitGroup]);
      doc.setAddress('number', groups[rule.numberGroup]);
      return;
    }
  }
};

// register the Australasian mapper under every country code it serves
for (const countryCode of ['AU', 'NZ']) {
  mappers[countryCode] = australasian;
}

module.exports = function () {
return through.obj((doc, enc, next) => {
try {
// only applies to records with a 'number' set and no 'unit' set (yet).
if (doc.hasAddress('number') && !doc.hasAddress('unit')) {

// select the appropriate mapper based on country code
const mapper = _.get(mappers, doc.getMeta('country_code'));
if (_.isFunction(mapper)) {

// run the country-specific mapper
mapper(doc);
}
}
}

catch (e) {
logger.error('unit_mapper error');
logger.error(e.stack);
logger.error(JSON.stringify(doc, null, 2));
}

return next(null, doc);
});
};
2 changes: 2 additions & 0 deletions test/data/au/input_file_3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
144.931874,-37.791488,10,Smith Street,,input city,input district,input region,input postcode,GAVIC718519668
11 changes: 11 additions & 0 deletions test/data/au/input_file_4.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
LON,LAT,HASH,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
144.9804144,-37.8723977,710daac656ffd0c3,10/244,BARKLY STREET,,ST KILDA,,VIC,"3182","50579518"
145.0378718,-37.8637847,92862c98c20bbe3d,10/244-246,WATTLETREE ROAD,,MALVERN,,VIC,"3144","208518759"
145.0003807,-37.8289596,d0a21035cebcd8ab,10/244-246,MARY STREET,,RICHMOND,,VIC,"3121","51463974"
144.978361,-37.8002503,4e891155eb009dc3,10/244,BRUNSWICK STREET,,FITZROY,,VIC,"3065","210464257"
144.9591621,-37.8331898,e20c57c01d5d42c0,110/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672310"
144.9591621,-37.8331898,50c85f85cce9181f,210/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672321"
144.9591621,-37.8331898,4e737a8cc6ada9ec,310/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672332"
144.9591621,-37.8331898,d6ed0494e8c53ff8,410/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672343"
144.9591621,-37.8331898,fa0691071a173dab,510/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672353"
144.925714,-37.7516895,00be263cea28bea0,10/244,PASCOE VALE ROAD,,ESSENDON,,VIC,"3040","429232726"
Loading

0 comments on commit f3331af

Please sign in to comment.