lruckman/BurteForceAddressNormalize.cs

## BurteForceAddressNormalize.cs

        // Strips everything except word characters, whitespace and '#'. The '#' is kept so that
        // unit markers such as "#5" can still be recognised while walking the segments.
        private static readonly Regex PunctuationPattern = new Regex(@"[^\w\s#]+", RegexOptions.Compiled);

        // Matches a '#' that is separated from the following text by whitespace (e.g. "# 5").
        private static readonly Regex OrphanedHashPattern = new Regex(@"#(\s)+", RegexOptions.Compiled);

        // Keywords that introduce a named subpremise; the keyword itself is always dropped.
        private static readonly HashSet<string> SubpremiseKeywords = new HashSet<string>(StringComparer.Ordinal)
        {
            "suite", "ste", "apt", "rm", "room", "bldg", "building", "route", "rte",
            "pavillion", "department", "dept", "unit"
        };

        // Keywords that mark a floor designation such as "2nd floor".
        private static readonly HashSet<string> FloorKeywords = new HashSet<string>(StringComparer.Ordinal)
        {
            "floor", "fl", "flr"
        };

        // Long form -> abbreviation. "floor", "fl" and "route" are intentionally absent:
        // those segments are consumed by the floor/subpremise handling before this lookup runs.
        private static readonly Dictionary<string, string> StreetAbbreviations = new Dictionary<string, string>(StringComparer.Ordinal)
        {
            {"avenue", "ave"},
            {"boulevard", "blvd"},
            {"circle", "cir"},
            {"court", "ct"},
            {"drive", "dr"},
            {"east", "e"},
            {"north", "n"},
            {"northeast", "ne"},
            {"northwest", "nw"},
            {"place", "pl"},
            {"road", "rd"},
            {"south", "s"},
            {"southeast", "se"},
            {"southwest", "sw"},
            {"sth", "s"},
            {"street", "st"},
            {"way", "wy"},
            {"west", "w"},
        };

        // Abbreviated compass directions that may be dropped when they appear late in the line.
        private static readonly HashSet<string> DirectionAbbreviations = new HashSet<string>(StringComparer.Ordinal)
        {
            "n", "ne", "nw", "e", "s", "se", "sw", "w"
        };

        /// <summary>
        /// Parses an address line, drops unit/subpremise/floor designations, abbreviates common
        /// street words and concatenates the result with city, state and postal code into a
        /// single comparison key.
        /// </summary>
        /// <param name="addressline">The street address line; <c>null</c> is treated as empty.</param>
        /// <param name="cityName">The city name; <c>null</c> is treated as empty.</param>
        /// <param name="stateCode">The state code; <c>null</c> is treated as empty.</param>
        /// <param name="postalCode">The postal code; only its first five characters are used. <c>null</c> is treated as empty.</param>
        /// <returns>
        /// A lower-cased string containing no whitespace and no punctuation other than '#',
        /// devoid of unit numbers.
        /// </returns>
        internal string ParsedStreetAddress(string addressline, string cityName, string stateCode, string postalCode)
        {
            addressline = addressline ?? "";
            cityName = cityName ?? "";
            stateCode = stateCode ?? "";
            postalCode = postalCode ?? "";

            // Lower-case first so every keyword comparison below can be a plain ordinal match,
            // then strip punctuation (keeping '#').
            addressline = PunctuationPattern.Replace(addressline.ToLowerInvariant(), "");

            // Re-attach orphaned hashes ("# 5" -> "#5") so the unit number travels with its
            // marker and is skipped below. Replacing with "" would drop the marker and let
            // the unit number leak into the result.
            addressline = OrphanedHashPattern.Replace(addressline, "#");

            // A null separator splits on every whitespace character. The punctuation pattern
            // preserves tabs and line breaks, so splitting on ' ' alone would glue segments together.
            var segments = addressline.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
            var normalizedAddress = "";

            for (var i = 0; i < segments.Length; i++)
            {
                var segment = segments[i];

                // "#..." is a unit number: drop it.
                if (segment.StartsWith("#", StringComparison.Ordinal))
                {
                    continue;
                }

                if (SubpremiseKeywords.Contains(segment))
                {
                    // Also skip the following segment when it is short enough to be the
                    // subpremise number. The cut-off is a trade-off: too generous and valid
                    // data is skipped, too strict and unit numbers are kept.
                    if (i + 1 < segments.Length && segments[i + 1].Length < 5)
                    {
                        i++;
                    }

                    continue;
                }

                if (FloorKeywords.Contains(segment))
                {
                    // Typical format is "... 2ND FLOOR ...", so the segment before the keyword
                    // was already appended and has to be taken off again.
                    // NOTE(review): RemoveFromEnd is defined outside this block; presumably it
                    // strips the given suffix only when present. The raw previous segment is
                    // passed, which will not match if that segment was abbreviated or skipped.
                    if (i != 0)
                    {
                        normalizedAddress = normalizedAddress.RemoveFromEnd(segments[i - 1]);
                    }

                    continue;
                }

                // Abbreviate well-known street words.
                string abbreviation;
                if (StreetAbbreviations.TryGetValue(segment, out abbreviation))
                {
                    segment = abbreviation;
                }

                // A direction past the third segment is most likely not part of the street name.
                if (i > 2 && DirectionAbbreviations.Contains(segment))
                {
                    continue;
                }

                normalizedAddress += segment;
            }

            // Only the first five characters of the postal code take part in the key.
            if (postalCode.Length > 5)
            {
                postalCode = postalCode.Substring(0, 5);
            }

            var combined = string.Concat(normalizedAddress, cityName, stateCode, postalCode)
                .ToLowerInvariant();

            // City/state/postal were not cleaned yet: strip punctuation and all whitespace.
            return new string(PunctuationPattern.Replace(combined, "")
                .Where(c => !char.IsWhiteSpace(c))
                .ToArray());
        }

	/// <summary>
	/// Parses an address line, drops unit/subpremise/floor designations, abbreviates common
	/// street words and concatenates the result with city, state and postal code into a
	/// single comparison key.
	/// </summary>
	/// <param name="addressline">The street address line; <c>null</c> is treated as empty.</param>
	/// <param name="cityName">The city name; <c>null</c> is treated as empty.</param>
	/// <param name="stateCode">The state code; <c>null</c> is treated as empty.</param>
	/// <param name="postalCode">The postal code; only its first five characters are used. <c>null</c> is treated as empty.</param>
	/// <returns>
	/// A lower-cased string containing no whitespace and no punctuation other than '#',
	/// devoid of unit numbers.
	/// </returns>
	internal string ParsedStreetAddress(string addressline, string cityName, string stateCode, string postalCode)
	{
		var punctuation = new Regex(@"[^\w\s#]+", RegexOptions.Compiled); // keep "#" so we can parse out apartments later
		var orphanedHashes = new Regex(@"#(\s)+", RegexOptions.Compiled); // a "#" separated from what follows by whitespace

		// Lookup tables are built once per call instead of once per segment.
		var subpremiseKeywords = new HashSet<string>
		{
			"suite", "ste", "apt", "rm", "room", "bldg", "building", "route", "rte",
			"pavillion", "department", "dept", "unit"
		};
		var floorKeywords = new HashSet<string> { "floor", "fl", "flr" };
		var directions = new HashSet<string> { "n", "ne", "nw", "e", "s", "se", "sw", "w" };

		// "floor", "fl" and "route" are not listed: those segments are consumed by the
		// floor/subpremise handling before the abbreviation lookup is reached.
		var abbreviations = new Dictionary<string, string>
		{
			{"avenue", "ave"},
			{"boulevard", "blvd"},
			{"circle", "cir"},
			{"court", "ct"},
			{"drive", "dr"},
			{"east", "e"},
			{"north", "n"},
			{"northeast", "ne"},
			{"northwest", "nw"},
			{"place", "pl"},
			{"road", "rd"},
			{"south", "s"},
			{"southeast", "se"},
			{"southwest", "sw"},
			{"sth", "s"},
			{"street", "st"},
			{"way", "wy"},
			{"west", "w"},
		};

		addressline = addressline ?? "";
		cityName = cityName ?? "";
		stateCode = stateCode ?? "";
		postalCode = postalCode ?? "";

		// lower it to simplify comparisons, then remove punctuation (keeping '#')
		addressline = punctuation.Replace(addressline.ToLowerInvariant(), "");

		// Re-attach orphaned hashes ("# 5" -> "#5") so the unit number is skipped together
		// with its marker; replacing with "" would let the unit number leak into the result.
		addressline = orphanedHashes.Replace(addressline, "#");

		// A null separator splits on every whitespace character; the punctuation pattern
		// keeps tabs and line breaks, so splitting on ' ' alone would glue segments together.
		var segments = addressline.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
		var normalizedAddress = "";

		for (var i = 0; i < segments.Length; i++)
		{
			var segment = segments[i];

			// "#..." is a unit number: drop it
			if (segment.StartsWith("#", StringComparison.Ordinal))
			{
				continue;
			}

			if (subpremiseKeywords.Contains(segment))
			{
				// be careful here: too much and we skip valid data, too little and we
				// include unit numbers. A short next segment is taken to be the number.
				if (i + 1 < segments.Length && segments[i + 1].Length < 5)
				{
					i++;
				}

				continue;
			}

			if (floorKeywords.Contains(segment))
			{
				// typical format is ... 2ND FLOOR ...., so the previous segment was already
				// appended and has to be removed again.
				// NOTE(review): RemoveFromEnd is defined outside this block; presumably it
				// strips the suffix only when present - confirm against its definition.
				if (i != 0)
				{
					normalizedAddress = normalizedAddress.RemoveFromEnd(segments[i - 1]);
				}

				continue;
			}

			// abbreviate well-known street words
			string abbreviation;
			if (abbreviations.TryGetValue(segment, out abbreviation))
			{
				segment = abbreviation;
			}

			// a direction past the third segment is most likely not a street name
			if (i > 2 && directions.Contains(segment))
			{
				continue;
			}

			// add this bit to the parsed output
			normalizedAddress += segment;
		}

		// only the first five characters of the postal code take part in the key
		if (postalCode.Length > 5)
		{
			postalCode = postalCode.Substring(0, 5);
		}

		normalizedAddress = string.Format("{0}{1}{2}{3}", normalizedAddress, cityName, stateCode, postalCode)
			.ToLowerInvariant();

		// city/state/postal were not cleaned yet: strip punctuation and all whitespace
		return new string(punctuation.Replace(normalizedAddress, "")
			.Where(c => !char.IsWhiteSpace(c))
			.ToArray());
	}