From 7ddc55f672c6e0b8f4320dded71ba0e24fbbd1c5 Mon Sep 17 00:00:00 2001 From: James Kolpack Date: Thu, 8 Jan 2026 08:46:11 -0500 Subject: [PATCH] Refactor event occurrence parsing to use dynamic month parsers and improve input normalization This commit replaces individual month parsers with a dynamic array of month names, enhancing maintainability. The EventOccurrenceParser has been updated to utilize this new structure, ensuring consistent parsing of month names. Additionally, input normalization has been improved by standardizing hyphen handling and ensuring that all relevant parsing methods utilize the sanitized input. This change streamlines the parsing process and enhances overall robustness. --- Core/Parsers/EventOccurrenceGrammar.cs | 98 ++++++++++++++------- Core/Parsers/EventOccurrenceParser.cs | 113 ++++++++++++------------- 2 files changed, 121 insertions(+), 90 deletions(-) diff --git a/Core/Parsers/EventOccurrenceGrammar.cs b/Core/Parsers/EventOccurrenceGrammar.cs index 96cc823..7755aa7 100644 --- a/Core/Parsers/EventOccurrenceGrammar.cs +++ b/Core/Parsers/EventOccurrenceGrammar.cs @@ -10,35 +10,27 @@ namespace Core.Parsers; /// public static class EventOccurrenceGrammar { - // Months - all 12 months supported - private static readonly Parser January = Parse.String("January").Text().Token(); - private static readonly Parser February = Parse.String("February").Text().Token(); - private static readonly Parser March = Parse.String("March").Text().Token(); - private static readonly Parser April = Parse.String("April").Text().Token(); - private static readonly Parser May = Parse.String("May").Text().Token(); - private static readonly Parser June = Parse.String("June").Text().Token(); - private static readonly Parser July = Parse.String("July").Text().Token(); - private static readonly Parser August = Parse.String("August").Text().Token(); - private static readonly Parser September = Parse.String("September").Text().Token(); - private static readonly Parser October = Parse.String("October").Text().Token(); - private static readonly Parser November = Parse.String("November").Text().Token(); - private static readonly Parser December = Parse.String("December").Text().Token(); + /// + /// Array of all month names in order (January through December). + /// This is the single source of truth for month names used throughout the parser. + /// + public static readonly string[] MonthNames = new[] + { + "January", "February", "March", "April", "May", "June", + "July", "August", "September", "October", "November", "December" + }; + + // Build month parsers dynamically from MonthNames array + private static readonly Parser[] MonthParsers = MonthNames + .Select(month => Parse.String(month).Text().Token()) + .ToArray(); /// /// Parser for month names (January through December). + /// Built dynamically from MonthNames array. /// - public static readonly Parser Month = January - .Or(February) - .Or(March) - .Or(April) - .Or(May) - .Or(June) - .Or(July) - .Or(August) - .Or(September) - .Or(October) - .Or(November) - .Or(December); + public static readonly Parser Month = MonthParsers + .Aggregate((current, next) => current.Or(next)); /// /// Parser for day of month (1-31, optional semicolon). @@ -66,9 +58,10 @@ public static class EventOccurrenceGrammar select $"{hour}:{(minute.IsDefined ? minute.Get() : "00")} {ampm}"; /// - /// Parser for hyphen characters (en-dash, hyphen, em-dash). + /// Parser for hyphen character. + /// Note: Input is assumed to be normalized (en-dash and em-dash converted to regular hyphen) via SanitizeInput. /// - public static readonly Parser Hyphen = Parse.Char('–').Or(Parse.Char('-')).Or(Parse.Char('—')); + public static readonly Parser Hyphen = Parse.Char('-'); /// /// Parser for time values, including ranges and special values (NOON, TBD). @@ -80,12 +73,13 @@ public static class EventOccurrenceGrammar from dash in Hyphen.Then(_ => Parse.WhiteSpace.Many()).Optional() from end in TimeValue.Or(Noon).Optional() select end.IsDefined - ? $"{start} – {end.Get()}" + ? $"{start} - {end.Get()}" : start ); /// - /// Parser for section headers: EventName [–-—] (MS|HS). + /// Parser for section headers: EventName - (MS|HS). + /// Note: Input is assumed to be normalized (hyphens normalized) via SanitizeInput. /// public static readonly Parser<(string EventName, string SchoolLevel)> SectionHeader = from eventName in Parse.AnyChar.Except(Hyphen).Many().Text().Token() @@ -149,5 +143,51 @@ public static class EventOccurrenceGrammar { return line.TrimStart().StartsWith("#", StringComparison.Ordinal); } + + /// + /// Attempts to parse an occurrence line from the given text. + /// Returns null if parsing fails. + /// Strategy: Find the first month name in the line, then parse from there. + /// + public static (string Name, string Month, int Day, string TimeAndLocation)? TryParseOccurrenceLine(string line) + { + // Find the first occurrence of any month name (using normalized MonthNames array) + int monthIndex = -1; + string foundMonth = string.Empty; + + foreach (var month in MonthNames) + { + var index = line.IndexOf(month, StringComparison.OrdinalIgnoreCase); + if (index >= 0 && (monthIndex < 0 || index < monthIndex)) + { + monthIndex = index; + foundMonth = month; + } + } + + if (monthIndex < 0) + return null; + + // Extract name (everything before the month) + var name = line.Substring(0, monthIndex).Trim(); + + // Parse from the month onwards + var restOfLine = line.Substring(monthIndex); + try + { + var monthParser = Parse.String(foundMonth).Text().Token(); + var result = from month in monthParser + from day in DayOfMonth.Token() + from timeAndLocation in Parse.AnyChar.Many().Text() + select (name, month, day, timeAndLocation.Trim()); + + var parsed = result.Parse(restOfLine); + return parsed; + } + catch + { + return null; + } + } } diff --git a/Core/Parsers/EventOccurrenceParser.cs b/Core/Parsers/EventOccurrenceParser.cs index b77ea83..284a698 100644 --- a/Core/Parsers/EventOccurrenceParser.cs +++ b/Core/Parsers/EventOccurrenceParser.cs @@ -27,25 +27,17 @@ public class EventOccurrenceParser _locationConfig = locationConfig; } - private Regex _re = - new ( - @"" + // - @"(?^[^#].*)\s" + - @"(?January|February|March|April|May|June|July|August|September|October|November|December)\s" + - @"(?\d{1,2});?\s" + - @"(?.*)" - ); - private readonly Regex _timeRe = new(@"(?\d{1,2}):?(?\d{2})?\s?(?(?:a|p)\.?m\.?)"); // Regex to match time ranges like "10:30 a.m. - 12:00 p.m." or "10:30 a.m. - NOON" // Matches: time1 (optional dash time2/NOON), then location // The time group captures the full time range (including " - NOON" if present) + // Note: Input is normalized via SanitizeInput, so only regular hyphens need to be handled // Pattern breakdown: // - First time: (?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)) - matches NOON or time with AM/PM (more flexible whitespace) - // - Optional range: (?:\s*[–-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))) - matches dash followed by NOON or time + // - Optional range: (?:\s*-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))) - matches dash followed by NOON or time // - Location: (?:\s+(?.+))? - optional whitespace followed by location (capture group with explicit name) - private readonly Regex _timeLocationRegex = new(@"(?