using System.Text.RegularExpressions;

namespace Core.Utility;

/// <summary>
/// Helpers for cleaning up career names entered as free-form, possibly bulleted, multiline text.
/// </summary>
public static class CareerNormalizer
{
    // Matches a single leading bullet marker (•, ‣, ◦, ⁃, ∙, '-' or '*') plus any whitespace
    // that follows it. Cached so the compiled regex is built once rather than looked up per call.
    private static readonly Regex LeadingBulletRegex = new(
        @"^[\u2022\u2023\u25E6\u2043\u2219\-\*]\s*",
        RegexOptions.Compiled);

    // All three newline conventions; "\r\n" is listed first so a Windows line ending
    // is consumed as one separator instead of producing an extra empty line.
    private static readonly string[] LineSeparators = { "\r\n", "\r", "\n" };

    /// <summary>
    /// Normalizes career names from multiline text input.
    /// Each line is treated as one career name: a leading bullet is stripped, whitespace is
    /// trimmed, blank results are dropped, and duplicates are removed case-insensitively
    /// (the first occurrence's casing is kept).
    /// </summary>
    /// <param name="input">Multiline text containing one career name per line; may be null or blank.</param>
    /// <returns>
    /// The distinct normalized career names in their original order, or an empty sequence
    /// when <paramref name="input"/> is null, empty, or whitespace.
    /// </returns>
    public static IEnumerable<string> NormalizeCareerNames(string? input)
    {
        if (string.IsNullOrWhiteSpace(input))
        {
            return Enumerable.Empty<string>();
        }

        return input
            .Split(LineSeparators, StringSplitOptions.None)
            .Select(NormalizeSingleCareerName)
            .Where(name => !string.IsNullOrWhiteSpace(name))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Normalizes a single career name by stripping one leading bullet marker and trimming whitespace.
    /// </summary>
    /// <param name="careerName">The raw career name (typically one line of input).</param>
    /// <returns>
    /// The cleaned career name, or <see cref="string.Empty"/> when the input is null, empty, or whitespace.
    /// </returns>
    private static string NormalizeSingleCareerName(string careerName)
    {
        if (string.IsNullOrWhiteSpace(careerName))
        {
            return string.Empty;
        }

        // Trim first so the '^' anchor sees the bullet even when the line is indented.
        var withoutBullet = LeadingBulletRegex.Replace(careerName.Trim(), string.Empty);

        return withoutBullet.Trim();
    }

    /// <summary>
    /// Builds the comparison key for a career name: the name is normalized
    /// (leading bullet removed, whitespace trimmed) and converted to lowercase using the invariant culture.
    /// Intended for case-insensitive duplicate detection.
    /// </summary>
    /// <param name="careerName">The career name to build a key for.</param>
    /// <returns>
    /// The lowercase normalized name, or <see cref="string.Empty"/> when the input is null, empty, or whitespace.
    /// </returns>
    public static string GetNormalizedKey(string careerName)
    {
        return NormalizeSingleCareerName(careerName).ToLowerInvariant();
    }
}