Last active
August 29, 2015 14:27
-
-
Save lruckman/556ca8154aa1329031f2 to your computer and use it in GitHub Desktop.
Brute-force address normalization for grouping. It is not an exact science, but it works fairly well for US addresses. Use at your own risk.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Strips everything except word characters, whitespace and "#".
// "#" is kept so that unit numbers such as "#5" can be recognised and dropped later.
private static readonly Regex AddressPunctuation = new Regex(@"[^\w\s#]+", RegexOptions.Compiled);

// Matches a "#" that is separated by whitespace from whatever follows it, e.g. "# 5".
private static readonly Regex OrphanedHash = new Regex(@"#(\s)+", RegexOptions.Compiled);

// Designators that introduce a subpremise; the designator itself is always dropped.
// Both "pavilion" and the legacy misspelling "pavillion" are accepted.
// NOTE(review): "route"/"rte" are treated as subpremises, so "Route 9" style street names lose
// both tokens. Kept as-is because existing grouping keys depend on it; confirm the intent.
private static readonly HashSet<string> SubpremiseDesignators = new HashSet<string>(StringComparer.Ordinal)
{
    "suite", "ste", "apt", "rm", "room", "bldg", "building", "route", "rte",
    "pavillion", "pavilion", "department", "dept", "unit"
};

// Tokens that mark a floor; the token before them ("2nd" in "2nd floor") is dropped as well.
private static readonly HashSet<string> FloorDesignators = new HashSet<string>(StringComparer.Ordinal)
{
    "floor", "fl", "flr"
};

// Long form -> abbreviation for street suffixes and compass directions.
private static readonly Dictionary<string, string> StreetAbbreviations = new Dictionary<string, string>(StringComparer.Ordinal)
{
    { "avenue", "ave" },
    { "boulevard", "blvd" },
    { "circle", "cir" },
    { "court", "ct" },
    { "drive", "dr" },
    { "east", "e" },
    { "north", "n" },
    { "northeast", "ne" },
    { "northwest", "nw" },
    { "place", "pl" },
    { "road", "rd" },
    { "south", "s" },
    { "southeast", "se" },
    { "southwest", "sw" },
    { "sth", "s" },
    { "street", "st" },
    { "way", "wy" },
    { "west", "w" },
};

// Abbreviated compass directions that may be dropped when they appear late in the address line.
private static readonly HashSet<string> DirectionAbbreviations = new HashSet<string>(StringComparer.Ordinal)
{
    "n", "ne", "nw", "e", "s", "se", "sw", "w"
};

/// <summary>
/// Builds a normalized key from an address so that differently formatted
/// versions of the same address can be grouped together.
/// </summary>
/// <remarks>
/// The address line is lower-cased, stripped of punctuation, split on whitespace and then
/// walked token by token: unit numbers ("#5"), subpremise designators ("apt 5", "suite 200"),
/// floor references ("2nd floor") and compass directions that appear after the third token
/// are dropped, and common street suffixes and directions are abbreviated. The remaining tokens
/// are concatenated with the city, state and the first five characters of the postal code.
/// This is a heuristic; it is not a full address parser.
/// </remarks>
/// <param name="addressline">The street address line. <c>null</c> is treated as empty.</param>
/// <param name="cityName">The city name. <c>null</c> is treated as empty.</param>
/// <param name="stateCode">The state code. <c>null</c> is treated as empty.</param>
/// <param name="postalCode">The postal code; only the first five characters are used. <c>null</c> is treated as empty.</param>
/// <returns>A lower-case string without whitespace or punctuation, devoid of unit numbers.</returns>
internal string ParsedStreetAddress(string addressline, string cityName, string stateCode, string postalCode)
{
    addressline = addressline ?? "";
    cityName = cityName ?? "";
    stateCode = stateCode ?? "";
    // Trim before truncating so padded input such as " 12345" does not lose a digit.
    postalCode = (postalCode ?? "").Trim();

    // Lower-case to simplify comparisons, then drop punctuation (except "#").
    addressline = addressline.ToLowerInvariant();
    addressline = AddressPunctuation.Replace(addressline, "");

    // Re-attach orphaned hashes ("# 5" -> "#5") so the unit number is recognised and dropped
    // below, exactly as it is when the input is already written as "#5".
    addressline = OrphanedHash.Replace(addressline, "#");

    // A null separator splits on any whitespace, so tabs and line breaks in
    // multi-line addresses separate tokens just like spaces do.
    var segments = addressline.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

    var pieces = new List<string>(segments.Length);
    // Index of the segment that produced the last entry in "pieces"; -1 when nothing was emitted.
    var lastEmittedIndex = -1;

    for (var i = 0; i < segments.Length; i++)
    {
        var segment = segments[i];

        // "#..." is a unit number. Segments are never empty because of RemoveEmptyEntries.
        if (segment[0] == '#')
        {
            continue;
        }

        if (SubpremiseDesignators.Contains(segment))
        {
            // Also skip the following token when it is short enough to be a subpremise number.
            // Too generous a limit skips valid data; too strict a limit lets unit numbers through.
            if ((i + 1) < segments.Length && segments[i + 1].Length < 5)
            {
                i++;
            }

            continue;
        }

        if (FloorDesignators.Contains(segment))
        {
            // Typical format is "... 2ND FLOOR ...", so the floor number precedes the keyword.
            // Only remove it when the previous segment is really what was emitted last; if it was
            // skipped (e.g. consumed as a unit number) there is nothing of it left to remove.
            if (pieces.Count > 0 && lastEmittedIndex == i - 1)
            {
                pieces.RemoveAt(pieces.Count - 1);
            }

            continue;
        }

        string abbreviation;
        if (StreetAbbreviations.TryGetValue(segment, out abbreviation))
        {
            segment = abbreviation;
        }

        // A direction after the third token is most likely a suffix/quadrant rather than
        // part of the street name, so drop it.
        if (i > 2 && DirectionAbbreviations.Contains(segment))
        {
            continue;
        }

        pieces.Add(segment);
        lastEmittedIndex = i;
    }

    if (postalCode.Length > 5)
    {
        postalCode = postalCode.Substring(0, 5);
    }

    var combined = (string.Concat(pieces) + cityName + stateCode + postalCode).ToLowerInvariant();

    // City, state and postal code have not been cleaned yet, so strip punctuation
    // and whitespace from the combined key.
    return new string(AddressPunctuation.Replace(combined, "")
        .Where(c => !char.IsWhiteSpace(c))
        .ToArray());
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment