Skip to content

Instantly share code, notes, and snippets.

@lruckman
Last active August 29, 2015 14:27
Show Gist options
  • Save lruckman/556ca8154aa1329031f2 to your computer and use it in GitHub Desktop.
Save lruckman/556ca8154aa1329031f2 to your computer and use it in GitHub Desktop.
Brute-force address normalization for grouping. It is not an exact science, but it works fairly well for addresses in the US. Use at your own risk.
// Lookup data shared by every call to ParsedStreetAddress. These are built once here instead of
// on every call (the regexes) or on every loop iteration (the tables). They are never modified.

// Strips everything except word characters, whitespace and "#"; "#" is kept so unit numbers can be detected.
private static readonly Regex AddressPunctuation = new Regex(@"[^\w\s#]+", RegexOptions.Compiled);

// Matches a "#" followed by whitespace ("# 5"), so it can be glued to the token after it ("#5").
private static readonly Regex AddressOrphanedHash = new Regex(@"#\s+", RegexOptions.Compiled);

// Keywords that introduce a subpremise. "pavillion" (sic) is kept for existing data; "pavilion" is the correct spelling.
private static readonly HashSet<string> AddressSubpremises = new HashSet<string>
{
    "suite", "ste", "apt", "rm", "room", "bldg", "building", "route", "rte",
    "pavillion", "pavilion", "department", "dept", "unit"
};

// Keywords that mark a floor designation such as "2nd floor".
private static readonly HashSet<string> AddressFloorMarkers = new HashSet<string> { "floor", "fl", "flr" };

// Long form -> abbreviation. "floor", "fl" and "route" are intentionally absent: those tokens are
// consumed by the floor/subpremise checks before the abbreviation lookup is ever reached.
private static readonly Dictionary<string, string> AddressAbbreviations = new Dictionary<string, string>
{
    {"avenue", "ave"},
    {"boulevard", "blvd"},
    {"circle", "cir"},
    {"court", "ct"},
    {"drive", "dr"},
    {"east", "e"},
    {"north", "n"},
    {"northeast", "ne"},
    {"northwest", "nw"},
    {"place", "pl"},
    {"road", "rd"},
    {"south", "s"},
    {"southeast", "se"},
    {"southwest", "sw"},
    {"sth", "s"},
    {"street", "st"},
    {"way", "wy"},
    {"west", "w"},
};

// Abbreviated compass directions that may be dropped when they appear late in the address line.
private static readonly HashSet<string> AddressDirections = new HashSet<string>
{
    "n", "ne", "nw", "e", "s", "se", "sw", "w"
};

/// <summary>
/// Builds a normalized key for an address so that differently written versions of the same
/// address can be grouped together. The address line is lower-cased, stripped of punctuation,
/// split into whitespace-separated tokens, and then filtered: "#"-prefixed unit numbers,
/// subpremise keywords (with a short following token), floor designations and late direction
/// indicators are dropped, and common street words are abbreviated.
/// </summary>
/// <param name="addressline">The street address line. <c>null</c> is treated as an empty string.</param>
/// <param name="cityName">The city name, appended after the street part. <c>null</c> is treated as an empty string.</param>
/// <param name="stateCode">The state code, appended after the city. <c>null</c> is treated as an empty string.</param>
/// <param name="postalCode">The postal code. It is trimmed and only its first five characters are used. <c>null</c> is treated as an empty string.</param>
/// <returns>
/// A lower-case string without whitespace made of the normalized street part followed by the
/// city, the state and the truncated postal code; unit numbers are removed from the street part.
/// </returns>
internal string ParsedStreetAddress(string addressline, string cityName, string stateCode, string postalCode)
{
    addressline = addressline ?? "";
    cityName = cityName ?? "";
    stateCode = stateCode ?? "";
    // Trim before truncating so padding does not eat into the five significant characters.
    postalCode = (postalCode ?? "").Trim();

    // Lower-case to simplify comparisons, then drop punctuation (but not "#").
    var cleaned = AddressPunctuation.Replace(addressline.ToLowerInvariant(), "");

    // Attach an orphaned "#" to the token after it ("# 5" -> "#5") so the unit number is
    // recognized and skipped below. Replacing with "" here would leave the bare number in the key.
    cleaned = AddressOrphanedHash.Replace(cleaned, "#");

    // A null separator array splits on any whitespace, so tabs and line breaks also separate tokens.
    var segments = cleaned.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

    var normalizedAddress = "";
    // Number of characters the previous input segment appended to normalizedAddress
    // (0 when that segment was skipped). Needed to undo it when a floor marker follows.
    var previousContribution = 0;

    for (var i = 0; i < segments.Length; i++)
    {
        var segment = segments[i];
        var appendedByPrevious = previousContribution;
        previousContribution = 0; // set again only if this segment ends up in the output

        // "#..." is a unit number.
        if (segment.StartsWith("#", StringComparison.Ordinal))
        {
            continue;
        }

        if (AddressSubpremises.Contains(segment))
        {
            // A short following token is assumed to be the subpremise number and is skipped too.
            // Too generous a limit would drop valid data; too strict would keep unit numbers.
            if (i + 1 < segments.Length && segments[i + 1].Length < 5)
            {
                i++;
            }
            continue;
        }

        if (AddressFloorMarkers.Contains(segment))
        {
            // Typical format is "... 2nd floor ...", so the token before the marker is dropped
            // as well, but only if it actually made it into the output. Removing exactly what
            // it appended avoids cutting into earlier tokens when it was skipped or abbreviated.
            if (appendedByPrevious > 0)
            {
                normalizedAddress = normalizedAddress.Substring(0, normalizedAddress.Length - appendedByPrevious);
            }
            continue;
        }

        string abbreviation;
        if (AddressAbbreviations.TryGetValue(segment, out abbreviation))
        {
            segment = abbreviation;
        }

        // A direction after the third token is most likely a suffix rather than part of the
        // street name, so it is dropped.
        if (i > 2 && AddressDirections.Contains(segment))
        {
            continue;
        }

        normalizedAddress += segment;
        previousContribution = segment.Length;
    }

    if (postalCode.Length > 5)
    {
        postalCode = postalCode.Substring(0, 5);
    }

    var combined = string.Concat(normalizedAddress, cityName, stateCode, postalCode).ToLowerInvariant();
    combined = AddressPunctuation.Replace(combined, "");

    return new string(combined.Where(c => !char.IsWhiteSpace(c)).ToArray());
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment