using System.Text.RegularExpressions;
namespace Core.Utility;
/// <summary>
/// Helpers for cleaning up user-entered career names: stripping a leading bullet
/// marker, trimming whitespace, and producing case-insensitive comparison keys.
/// </summary>
public static class CareerNormalizer
{
    // Matches ONE leading bullet marker (•, ‣, ◦, ⁃, ∙, '-' or '*') plus any whitespace after it.
    // Built once and reused; the pattern is anchored, so only the start of the text is affected.
    private static readonly Regex LeadingBulletPattern = new(
        @"^[\u2022\u2023\u25E6\u2043\u2219\-\*\•]\s*",
        RegexOptions.Compiled);

    /// <summary>
    /// Normalizes career names from multiline text input.
    /// Each line is treated as one name: a leading bullet marker is stripped, whitespace is
    /// trimmed, blank results are dropped, and duplicates are removed ignoring case.
    /// </summary>
    /// <param name="input">
    /// Multiline text containing one career name per line. Lines may be separated by
    /// <c>\r\n</c>, <c>\r</c> or <c>\n</c>. May be <see langword="null"/>.
    /// </param>
    /// <returns>
    /// The distinct normalized career names; an empty sequence when <paramref name="input"/>
    /// is <see langword="null"/>, empty, or whitespace only.
    /// </returns>
    public static IEnumerable<string> NormalizeCareerNames(string? input)
    {
        if (string.IsNullOrWhiteSpace(input))
        {
            return Enumerable.Empty<string>();
        }

        // "\r\n" is listed first so a Windows line ending is consumed as a single separator.
        string[] lineSeparators = { "\r\n", "\r", "\n" };

        return input
            .Split(lineSeparators, StringSplitOptions.None)
            .Select(NormalizeSingleCareerName)
            .Where(name => !string.IsNullOrWhiteSpace(name))
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    /// <summary>
    /// Normalizes a single career name by stripping one leading bullet marker and
    /// trimming surrounding whitespace.
    /// </summary>
    /// <param name="careerName">The career name to normalize.</param>
    /// <returns>
    /// The normalized career name, or <see cref="string.Empty"/> when the input is
    /// <see langword="null"/>, empty, or whitespace only.
    /// </returns>
    private static string NormalizeSingleCareerName(string careerName)
    {
        if (string.IsNullOrWhiteSpace(careerName))
        {
            return string.Empty;
        }

        // Trim first so the anchored pattern sees the bullet even when the line is indented.
        var withoutBullet = LeadingBulletPattern.Replace(careerName.Trim(), string.Empty);

        // The replacement can expose new leading/trailing whitespace; trim again.
        return withoutBullet.Trim();
    }

    /// <summary>
    /// Builds a comparison key for a career name, intended for case-insensitive
    /// duplicate detection. The name is normalized (bullet stripped, trimmed) and then
    /// lowercased using the invariant culture.
    /// </summary>
    /// <param name="careerName">The career name to build a key for.</param>
    /// <returns>The lowercase normalized name.</returns>
    public static string GetNormalizedKey(string careerName)
    {
        return NormalizeSingleCareerName(careerName).ToLowerInvariant();
    }
}