Source code

Revision control

Copy as Markdown

Other Tools

/* eslint-disable no-useless-concat */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
 * A named capturing group of a regular expression, together with the value
 * that group captured the last time a match result was applied to it.
 */
class NamedCaptureGroup {
  // The name of this capturing group.
  #name = null;

  // The regular expression source that contains the capturing group.
  #capture = null;

  // The matched result; stays null until setMatch() has been called.
  #match = null;

  /**
   * @param {string} name The name of the capturing group.
   * @param {string} capture The regular expression source for the group.
   */
  constructor(name, capture) {
    this.#capture = capture;
    this.#name = name;
  }

  /** The value recorded by the most recent setMatch() call. */
  get match() {
    return this.#match;
  }

  /** The regular expression source given to the constructor. */
  get capture() {
    return this.#capture;
  }

  /** The group name given to the constructor. */
  get name() {
    return this.#name;
  }

  /**
   * Record the matched result by looking this group's name up in the groups
   * of a match result.
   *
   * @param {object} matchGroups The `groups` object of a RegExp match.
   */
  setMatch(matchGroups) {
    const groupName = this.#name;
    this.#match = matchGroups[groupName];
  }
}
/**
 * Base class for the different parts of a street address regular expression.
 *
 * The regular expression is assembled from a prefix, a pattern, a suffix and
 * a separator; only the text matched by the pattern (the "value" part) is
 * exposed through the named capturing group.
 *
 * For example, when "apt 4." is written for an apartment number, the prefix
 * is `apt`, the suffix is `.` and the value representing the apartment
 * number is `4`.
 */
class StreetAddressPartRegExp extends NamedCaptureGroup {
  /**
   * @param {string} name Name of the capturing group holding the value.
   * @param {?string} prefix Regexp source preceding the value; nullish means none.
   * @param {string} pattern Regexp source matching the value itself.
   * @param {?string} suffix Regexp source following the value; nullish means none.
   * @param {string} sep Regexp source of the separator; it must occur at
   *   least once after the suffix.
   * @param {boolean} [optional=false] Whether the whole part may be absent.
   */
  constructor(name, prefix, pattern, suffix, sep, optional = false) {
    // Only locals are touched before super(), so `this` is never read early.
    const head = `(?:${prefix ?? ""})`;
    const value = `(?<${name}>${pattern})`;
    const tail = `(?:${suffix ?? ""})`;
    const separators = `(?:${sep})+`;
    const quantifier = optional ? "?" : "";

    super(name, `(${head}${value}${tail}${separators})${quantifier}`);
  }
}
/**
 * A regular expression to match the street number portion of a street
 * address. The capturing group is named after the class.
 */
class StreetNumberRegExp extends StreetAddressPartRegExp {
  // Optional marker in front of the number, e.g. "no." or "number".
  // From chromium source
  static PREFIX = "((no|°|º|number)(\\.|-|\\s)*)?";

  // One or more digits, optionally followed by a single word character.
  static PATTERN = "\\d+\\w?";

  // TODO: possible suffix : (th\\.|\\.)?
  static SUFFIX = null;

  /**
   * @param {string} sep Regexp source of the separator following this part.
   * @param {boolean} [optional] Whether this part may be absent.
   */
  constructor(sep, optional) {
    const { name, PREFIX, PATTERN, SUFFIX } = StreetNumberRegExp;
    super(name, PREFIX, PATTERN, SUFFIX, sep, optional);
  }
}
/**
 * A regular expression to match the street name portion of a street address.
 * The capturing group is named after the class.
 */
class StreetNameRegExp extends StreetAddressPartRegExp {
  static PREFIX = null;

  // Runs of characters that are neither whitespace nor commas, joined by
  // whitespace other than CR/LF; the repetition is lazy.
  // From chromium source
  static PATTERN = "(?:[^\\s,]+(?:[^\\S\\r\\n]+[^\\s,]+)*?)";

  // TODO: Should we consider suffix like (ave|st)?
  static SUFFIX = null;

  /**
   * @param {string} sep Regexp source of the separator following this part.
   * @param {boolean} [optional] Whether this part may be absent.
   */
  constructor(sep, optional) {
    const { name, PREFIX, PATTERN, SUFFIX } = StreetNameRegExp;
    super(name, PREFIX, PATTERN, SUFFIX, sep, optional);
  }
}
/**
 * A regular expression to match the apartment number portion of a street
 * address. The capturing group is named after the class.
 */
class ApartmentNumberRegExp extends StreetAddressPartRegExp {
  // Alternation of the keywords that introduce an apartment number.
  static keyword =
    "apt|apartment|wohnung|apto|-" + // From chromium source
    "|unit|suite|ste|#|room"; // Firefox specific

  // A keyword followed by any number of dots, whitespace or dashes.
  static PREFIX = `(${this.keyword})(\\.|\\s|-)*`;

  // Word characters, optionally followed by a delimiter and more word
  // characters.
  // NOTE(review): the character class `[-|\/]` also accepts a literal `|`,
  // not only `-` and `/` -- confirm that this is intended.
  static PATTERN = "\\w*([-|\\/]\\w*)?";

  // From chromium source
  static SUFFIX = "(\\.|\\s|-)*(ª)?";

  /**
   * @param {string} sep Regexp source of the separator following this part.
   * @param {boolean} [optional] Whether this part may be absent.
   */
  constructor(sep, optional) {
    const { name, PREFIX, PATTERN, SUFFIX } = ApartmentNumberRegExp;
    super(name, PREFIX, PATTERN, SUFFIX, sep, optional);
  }
}
/**
 * A regular expression to match the floor number portion of a street address.
 * The capturing group is named after the class.
 */
class FloorNumberRegExp extends StreetAddressPartRegExp {
  // Alternation of the keywords that mark a floor number; the same list is
  // accepted both before and after the number.
  static keyword =
    "floor|flur|fl|og|obergeschoss|ug|untergeschoss|geschoss|andar|piso|º" + // From chromium source
    "|level|lvl"; // Firefox specific

  // Optional keyword followed by any number of dots, whitespace or dashes.
  static PREFIX = `(${this.keyword})?(\\.|\\s|-)*`; // TODO

  // One to three digits, optionally followed by a single word character.
  static PATTERN = "\\d{1,3}\\w?";

  // Optional ordinal ending, then punctuation/whitespace, then an optional
  // keyword.
  static SUFFIX = `(st|nd|rd|th)?(\\.|\\s|-)*(${this.keyword})?`; // TODO

  /**
   * @param {string} sep Regexp source of the separator following this part.
   * @param {boolean} [optional] Whether this part may be absent.
   */
  constructor(sep, optional) {
    const { name, PREFIX, PATTERN, SUFFIX } = FloorNumberRegExp;
    super(name, PREFIX, PATTERN, SUFFIX, sep, optional);
  }
}
/**
 * Class represents a street address with the following fields:
 * - street number
 * - street name
 * - apartment number
 * - floor number
 *
 * Every field is stored as the result of calling toString() on the value
 * given to the constructor; a null or undefined value is stored as undefined.
 */
export class StructuredStreetAddress {
  #street_number = null;
  #street_name = null;
  #apartment_number = null;
  #floor_number = null;

  /**
   * @param {*} street_number
   * @param {*} street_name
   * @param {*} apartment_number
   * @param {*} floor_number
   */
  constructor(street_number, street_name, apartment_number, floor_number) {
    // Optional chaining turns null/undefined into undefined instead of throwing.
    const asText = value => value?.toString();

    this.#street_number = asText(street_number);
    this.#street_name = asText(street_name);
    this.#apartment_number = asText(apartment_number);
    this.#floor_number = asText(floor_number);
  }

  get street_number() {
    return this.#street_number;
  }

  get street_name() {
    return this.#street_name;
  }

  get apartment_number() {
    return this.#apartment_number;
  }

  get floor_number() {
    return this.#floor_number;
  }

  /**
   * Multi-line description of the address: a leading line break, then one
   * labelled row per field, each row followed by two line breaks.
   *
   * @returns {string}
   */
  toString() {
    const rows = [
      `street number: ${this.#street_number}`,
      `street name: ${this.#street_name}`,
      `apartment number: ${this.#apartment_number}`,
      `floor number: ${this.#floor_number}`,
    ];
    return `\n${rows.map(row => `${row}\n\n`).join("")}`;
  }
}
/**
 * Static helpers for normalizing address strings and for parsing a street
 * address into its structured parts.
 */
export class AddressParser {
  /**
   * Parse street address with the following pattern.
   * street number, street name, apartment number(optional), floor number(optional)
   * For example, 2 Harrison St #175 floor 2
   *
   * The address is lowercased and whitespace-normalized before matching, so
   * the returned fields are taken from the normalized text.
   *
   * @param {string} address The street address to be parsed.
   * @returns {StructuredStreetAddress|null} The parsed address, or null when
   *   the address is empty, is not a string, or does not match the pattern.
   */
  static parseStreetAddress(address) {
    if (!address) {
      return null;
    }

    const separator = "(\\s|,|$)";
    const regexpes = [
      new StreetNumberRegExp(separator),
      new StreetNameRegExp(separator),
      new ApartmentNumberRegExp(separator, true),
      new FloorNumberRegExp(separator, true),
    ];

    // parse() stores each captured value on its capture group; the groups are
    // listed in the order of the StructuredStreetAddress constructor arguments.
    return AddressParser.parse(address, regexpes)
      ? new StructuredStreetAddress(...regexpes.map(regexp => regexp.match))
      : null;
  }

  /**
   * Match a normalized address against the concatenation of the given capture
   * groups. The whole address must match, from start to end.
   *
   * On success every capture group is updated through its setMatch() method.
   *
   * @param {string} address The address to be parsed.
   * @param {Array} regexpes Capture groups exposing `name`, `capture`,
   *   `match` and `setMatch()`.
   * @returns {object|null} An object mapping each group name to its matched
   *   value, or null when the address is not a string or does not match.
   */
  static parse(address, regexpes) {
    const options = {
      // Not read by normalizeString(), which always trims its result.
      trim: true,
      merge_whitespace: true,
      ignore_case: true,
    };

    const normalized = AddressParser.normalizeString(address, options);
    // normalizeString() hands non-string input back untouched; such input has
    // no match() method, so it is reported as "no match" instead of throwing.
    if (typeof normalized !== "string") {
      return null;
    }

    const source = regexpes.map(regexp => regexp.capture).join("");
    const match = normalized.match(new RegExp(`^(${source})$`));
    if (!match) {
      return null;
    }

    // `groups` is undefined when the assembled pattern has no named groups.
    const groups = match.groups ?? {};
    regexpes.forEach(regexp => regexp.setMatch(groups));

    return Object.fromEntries(
      regexpes.map(regexp => [regexp.name, regexp.match])
    );
  }

  /**
   * Normalize a string according to the given options. The result is always
   * trimmed. Non-string input is returned unchanged.
   *
   * @param {*} s The value to normalize.
   * @param {object} [options] Normalization options; all are optional.
   * @param {boolean} [options.ignore_case] Lowercase the string.
   * @param {boolean} [options.remove_punctuation] Remove all punctuation.
   * @param {string} [options.replace_punctuation] Replacement for every
   *   punctuation character; only used when remove_punctuation is not set.
   * @param {boolean} [options.merge_whitespace] Collapse runs of two or more
   *   whitespace characters into one space.
   * @param {boolean} [options.remove_whitespace] Remove all whitespace; only
   *   used when merge_whitespace is not set.
   * @returns {*} The normalized string, or `s` itself when it is not a string.
   */
  static normalizeString(s, options) {
    if (typeof s !== "string") {
      return s;
    }

    // Tolerate a missing (undefined or null) options argument.
    const opts = options ?? {};

    if (opts.ignore_case) {
      s = s.toLowerCase();
    }

    // process punctuation before whitespace because if a punctuation
    // is replaced with whitespace, we might want to merge it later
    if (opts.remove_punctuation) {
      s = AddressParser.replacePunctuation(s, "");
    } else if ("replace_punctuation" in opts) {
      s = AddressParser.replacePunctuation(s, opts.replace_punctuation);
    }

    // process whitespace
    if (opts.merge_whitespace) {
      s = AddressParser.mergeWhitespace(s);
    } else if (opts.remove_whitespace) {
      s = AddressParser.removeWhitespace(s);
    }

    return s.trim();
  }

  /**
   * Replace every Unicode punctuation character with `replace`.
   * Returns undefined when `s` is null or undefined.
   */
  static replacePunctuation(s, replace) {
    const regex = /\p{Punctuation}/gu;
    return s?.replace(regex, replace);
  }

  /**
   * Remove a fixed set of ASCII punctuation characters.
   * Returns undefined when `s` is null or undefined.
   */
  static removePunctuation(s) {
    return s?.replace(/[.,\/#!$%\^&\*;:{}=\-_~()]/g, "");
  }

  /**
   * Replace each tab, line feed and carriage return with a single space.
   * Returns undefined when `s` is null or undefined.
   */
  static replaceControlCharacters(s) {
    return s?.replace(/[\t\n\r]/g, " ");
  }

  /**
   * Remove every whitespace character.
   * Returns undefined when `s` is null or undefined.
   */
  static removeWhitespace(s) {
    return s?.replace(/[\s]/g, "");
  }

  /**
   * Collapse each run of two or more whitespace characters into one space.
   * A lone whitespace character is left as it is.
   * Returns undefined when `s` is null or undefined.
   */
  static mergeWhitespace(s) {
    return s?.replace(/\s{2,}/g, " ");
  }
}