Refactor event occurrence parsing to use dynamic month parsers and improve input normalization

This commit replaces individual month parsers with a dynamic array of month names, enhancing maintainability. The EventOccurrenceParser has been updated to utilize this new structure, ensuring consistent parsing of month names. Additionally, input normalization has been improved by standardizing hyphen handling and ensuring that all relevant parsing methods utilize the sanitized input. This change streamlines the parsing process and enhances overall robustness.
This commit is contained in:
2026-01-08 08:46:11 -05:00
parent f32ce649cd
commit 7ddc55f672
2 changed files with 121 additions and 90 deletions
+69 -29
View File
@@ -10,35 +10,27 @@ namespace Core.Parsers;
/// </summary>
public static class EventOccurrenceGrammar
{
// Months - all 12 months supported
private static readonly Parser<string> January = Parse.String("January").Text().Token();
private static readonly Parser<string> February = Parse.String("February").Text().Token();
private static readonly Parser<string> March = Parse.String("March").Text().Token();
private static readonly Parser<string> April = Parse.String("April").Text().Token();
private static readonly Parser<string> May = Parse.String("May").Text().Token();
private static readonly Parser<string> June = Parse.String("June").Text().Token();
private static readonly Parser<string> July = Parse.String("July").Text().Token();
private static readonly Parser<string> August = Parse.String("August").Text().Token();
private static readonly Parser<string> September = Parse.String("September").Text().Token();
private static readonly Parser<string> October = Parse.String("October").Text().Token();
private static readonly Parser<string> November = Parse.String("November").Text().Token();
private static readonly Parser<string> December = Parse.String("December").Text().Token();
/// <summary>
/// Canonical English month names, ordered January through December.
/// Every other month-related parser or lookup in the grammar is derived from this array,
/// so a month's 1-based number is its array index plus one.
/// </summary>
public static readonly string[] MonthNames =
{
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
};

// One whitespace-tolerant (Token) parser per entry in MonthNames, in the same order.
private static readonly Parser<string>[] MonthParsers =
    Array.ConvertAll(MonthNames, name => Parse.String(name).Text().Token());
/// <summary>
/// Parser for month names (January through December).
/// Built dynamically from MonthNames array.
/// </summary>
public static readonly Parser<string> Month = January
.Or(February)
.Or(March)
.Or(April)
.Or(May)
.Or(June)
.Or(July)
.Or(August)
.Or(September)
.Or(October)
.Or(November)
.Or(December);
public static readonly Parser<string> Month = MonthParsers
.Aggregate((current, next) => current.Or(next));
/// <summary>
/// Parser for day of month (1-31, optional semicolon).
@@ -66,9 +58,10 @@ public static class EventOccurrenceGrammar
select $"{hour}:{(minute.IsDefined ? minute.Get() : "00")} {ampm}";
/// <summary>
/// Parser for hyphen characters (en-dash, hyphen, em-dash).
/// Parser for hyphen character.
/// Note: Input is assumed to be normalized (en-dash and em-dash converted to regular hyphen) via SanitizeInput.
/// </summary>
public static readonly Parser<char> Hyphen = Parse.Char('').Or(Parse.Char('-')).Or(Parse.Char('—'));
public static readonly Parser<char> Hyphen = Parse.Char('-');
/// <summary>
/// Parser for time values, including ranges and special values (NOON, TBD).
@@ -80,12 +73,13 @@ public static class EventOccurrenceGrammar
from dash in Hyphen.Then(_ => Parse.WhiteSpace.Many()).Optional()
from end in TimeValue.Or(Noon).Optional()
select end.IsDefined
? $"{start} {end.Get()}"
? $"{start} - {end.Get()}"
: start
);
/// <summary>
/// Parser for section headers: EventName [-—] (MS|HS).
/// Parser for section headers: EventName - (MS|HS).
/// Note: Input is assumed to be normalized (hyphens normalized) via SanitizeInput.
/// </summary>
public static readonly Parser<(string EventName, string SchoolLevel)> SectionHeader =
from eventName in Parse.AnyChar.Except(Hyphen).Many().Text().Token()
@@ -149,5 +143,51 @@ public static class EventOccurrenceGrammar
{
return line.TrimStart().StartsWith("#", StringComparison.Ordinal);
}
/// <summary>
/// Attempts to parse an occurrence line of the form <c>Name Month Day Time/Location</c>.
/// </summary>
/// <remarks>
/// Month names are located case-insensitively and candidates are tried from left to right.
/// A candidate is accepted only when it is immediately followed (ignoring whitespace) by a
/// valid <see cref="DayOfMonth"/>; otherwise the search continues with the next month name.
/// This lets lines whose event name happens to contain a month name
/// (for example "Marching Band March 5 ...") still parse.
/// </remarks>
/// <param name="line">The (already normalized) input line.</param>
/// <returns>
/// The text before the month (trimmed), the canonical month name from <see cref="MonthNames"/>,
/// the parsed day, and the trimmed remainder of the line; or <c>null</c> when the line is
/// null/empty or no month name followed by a day is found.
/// </returns>
public static (string Name, string Month, int Day, string TimeAndLocation)? TryParseOccurrenceLine(string line)
{
    if (string.IsNullOrEmpty(line))
        return null;

    var searchFrom = 0;
    while (true)
    {
        // Locate the earliest month name at or after the current search position.
        var monthIndex = -1;
        var foundMonth = string.Empty;
        foreach (var month in MonthNames)
        {
            var index = line.IndexOf(month, searchFrom, StringComparison.OrdinalIgnoreCase);
            if (index >= 0 && (monthIndex < 0 || index < monthIndex))
            {
                monthIndex = index;
                foundMonth = month;
            }
        }

        // No (further) month name in the line: this is not an occurrence line.
        if (monthIndex < 0)
            return null;

        // Everything before the month is the occurrence name.
        var name = line.Substring(0, monthIndex).Trim();

        // The month text itself was matched by IndexOf (case-insensitively), so skip past it
        // and parse only what follows; the canonical spelling from MonthNames is returned.
        var afterMonth = line.Substring(monthIndex + foundMonth.Length);

        var occurrence =
            from day in DayOfMonth.Token()
            from timeAndLocation in Parse.AnyChar.Many().Text()
            select (name, foundMonth, day, timeAndLocation.Trim());

        // TryParse reports failure through the result object instead of throwing.
        var result = occurrence.TryParse(afterMonth);
        if (result.WasSuccessful)
            return result.Value;

        // This month name was not followed by a day (e.g. it was part of a longer word);
        // resume the search just past its first character.
        searchFrom = monthIndex + 1;
    }
}
}
+50 -59
View File
@@ -27,25 +27,17 @@ public class EventOccurrenceParser
_locationConfig = locationConfig;
}
private Regex _re =
new (
@"" + //
@"(?<Name>^[^#].*)\s" +
@"(?<Month>January|February|March|April|May|June|July|August|September|October|November|December)\s" +
@"(?<DayOfMonth>\d{1,2});?\s" +
@"(?<TimeAndLocation>.*)"
);
private readonly Regex _timeRe = new(@"(?<Hour>\d{1,2}):?(?<Minute>\d{2})?\s?(?<APM>(?:a|p)\.?m\.?)");
// Regex to match time ranges like "10:30 a.m. - 12:00 p.m." or "10:30 a.m. - NOON"
// Matches: time1 (optional dash time2/NOON), then location
// The time group captures the full time range (including " - NOON" if present)
// Note: Input is normalized via SanitizeInput, so only regular hyphens need to be handled
// Pattern breakdown:
// - First time: (?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)) - matches NOON or time with AM/PM (more flexible whitespace)
// - Optional range: (?:\s*[-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))) - matches dash followed by NOON or time
// - Optional range: (?:\s*-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))) - matches dash followed by NOON or time
// - Location: (?:\s+(?<Location>.+))? - optional whitespace followed by location (capture group with explicit name)
private readonly Regex _timeLocationRegex = new(@"(?<Time>(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))(?:\s*[-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)))?)(?:\s+(?<Location>.+))?");
private readonly Regex _timeLocationRegex = new(@"(?<Time>(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))(?:\s*-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)))?)(?:\s+(?<Location>.+))?");
public EventOccurrenceParserResult Parse()
{
@@ -57,21 +49,25 @@ public class EventOccurrenceParser
var lines = File.ReadLines(_txtFile.FullName);
foreach (var (line, index) in lines.Select((line, index) => (line, index + 1)))
{
var trimmedLine = line.Trim();
// Normalize input: trim and normalize hyphens (en-dash, em-dash -> regular hyphen)
// This allows the grammar parser to assume normalized input
var normalizedLine = SanitizeInput(line.Trim());
// Skip empty lines
if (string.IsNullOrWhiteSpace(trimmedLine))
if (string.IsNullOrWhiteSpace(normalizedLine))
continue;
// Skip comment lines (starting with "#") - use grammar parser
if (EventOccurrenceGrammar.IsCommentLine(trimmedLine))
if (EventOccurrenceGrammar.IsCommentLine(normalizedLine))
continue;
var match = _re.Match(trimmedLine);
if (!match.Success)
// Try to parse occurrence line using grammar parser
var occurrenceLine = EventOccurrenceGrammar.TryParseOccurrenceLine(normalizedLine);
if (!occurrenceLine.HasValue)
{
// Not an occurrence line, try other line types
// Try to parse section header using grammar parser
var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(trimmedLine);
var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(normalizedLine);
if (sectionHeader.HasValue)
{
var (eventNamePart, schoolLevel) = sectionHeader.Value;
@@ -88,9 +84,9 @@ public class EventOccurrenceParser
issues.Add(new ParsingIssue
{
LineNumber = index,
LineContent = trimmedLine,
LineContent = normalizedLine,
IssueType = ParsingIssueType.UnmatchedLine,
Message = $"Section header '{eventNamePart} {schoolLevel}' found but no matching event definition (best match ratio: {Fuzz.Ratio(eventNamePart, _events.FirstOrDefault()?.Name ?? "")})"
Message = $"Section header '{eventNamePart} - {schoolLevel}' found but no matching event definition (best match ratio: {Fuzz.Ratio(eventNamePart, _events.FirstOrDefault()?.Name ?? "")})"
});
continue;
}
@@ -99,18 +95,18 @@ public class EventOccurrenceParser
}
// Check for General Schedule/Session using grammar parser
if (EventOccurrenceGrammar.IsGeneralSchedule(trimmedLine))
if (EventOccurrenceGrammar.IsGeneralSchedule(normalizedLine))
{
currentEventDefinition = EventDefinition.GeneralSchedule;
continue;
}
// Also check for simple "MS" or "HS" in line (backward compatibility)
if (trimmedLine.Contains("MS") || trimmedLine.Contains("HS"))
if (normalizedLine.Contains("MS") || normalizedLine.Contains("HS"))
{
var evt =
(from e in _events
let rat = Fuzz.Ratio(e.Name, trimmedLine)
let rat = Fuzz.Ratio(e.Name, normalizedLine)
where rat > 50
orderby rat descending
select e).FirstOrDefault();
@@ -119,9 +115,9 @@ public class EventOccurrenceParser
issues.Add(new ParsingIssue
{
LineNumber = index,
LineContent = trimmedLine,
LineContent = normalizedLine,
IssueType = ParsingIssueType.UnmatchedLine,
Message = $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {Fuzz.Ratio(trimmedLine, _events.FirstOrDefault()?.Name ?? "")})"
Message = $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {Fuzz.Ratio(normalizedLine, _events.FirstOrDefault()?.Name ?? "")})"
});
continue;
}
@@ -134,19 +130,19 @@ public class EventOccurrenceParser
// - Start with lowercase or special characters (not event names)
// - Are parenthetical notes like "(Semifinalists only)"
// - Are informational text like "Schedule Posted on..."
if (IsContinuationLine(trimmedLine))
if (IsContinuationLine(normalizedLine))
{
continue;
}
// "Voting Delegates" section header is no longer used - occurrences are categorized by name pattern
// Track as unmatched line if it's not empty
if (!string.IsNullOrWhiteSpace(trimmedLine))
if (!string.IsNullOrWhiteSpace(normalizedLine))
{
issues.Add(new ParsingIssue
{
LineNumber = index,
LineContent = trimmedLine,
LineContent = normalizedLine,
IssueType = ParsingIssueType.UnmatchedLine,
Message = "Line does not match expected format (Name Month Day Time/Location)"
});
@@ -154,11 +150,9 @@ public class EventOccurrenceParser
continue;
}
var occurrenceName = match.Groups["Name"].Captures[0].Value;
var month = match.Groups["Month"].Captures[0].Value;
var dayOfMonth = match.Groups["DayOfMonth"].Captures[0].Value;
var timeAndLocation = match.Groups["TimeAndLocation"].Captures[0].Value;
var (occurrenceName, month, dayOfMonthStr, timeAndLocation) = occurrenceLine.Value;
// Remove weekday suffix from occurrence name if present
occurrenceName = Regex.Replace(occurrenceName,
@"(?<Weekday>Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s?$", "").Trim();
@@ -171,30 +165,30 @@ public class EventOccurrenceParser
issues.Add(new ParsingIssue
{
LineNumber = index,
LineContent = trimmedLine,
LineContent = normalizedLine,
IssueType = ParsingIssueType.MissingEventDefinition,
Message = $"Cannot determine event definition for occurrence: {occurrenceName}"
});
continue;
}
timeAndLocation = SanitizeInput(timeAndLocation);
// timeAndLocation is already normalized (hyphens normalized) since normalizedLine was sanitized
// Parse time and location using configurable patterns
var (time, location, locationParseSuccess) = ParseTimeAndLocation(timeAndLocation, index, trimmedLine, issues);
var (time, location, locationParseSuccess) = ParseTimeAndLocation(timeAndLocation, index, normalizedLine, issues);
// Parse date
DateOnly? startDate = null;
try
{
startDate = ParseDate(month, dayOfMonth, DateTime.Now.Year);
startDate = ParseDate(month, dayOfMonthStr.ToString(), DateTime.Now.Year);
}
catch (Exception ex)
{
issues.Add(new ParsingIssue
{
LineNumber = index,
LineContent = trimmedLine,
LineContent = normalizedLine,
IssueType = ParsingIssueType.DateParseFailure,
Message = $"Failed to parse date: {ex.Message}"
});
@@ -212,7 +206,7 @@ public class EventOccurrenceParser
issues.Add(new ParsingIssue
{
LineNumber = index,
LineContent = trimmedLine,
LineContent = normalizedLine,
IssueType = ParsingIssueType.TimeParseFailure,
Message = $"Failed to parse time '{time}': {ex.Message}"
});
@@ -229,7 +223,7 @@ public class EventOccurrenceParser
Name = occurrenceName,
StartTime = t,
Time = $"{time}",
Date = $"{month} {dayOfMonth}",
Date = $"{month} {dayOfMonthStr}",
Location = location
};
@@ -311,23 +305,16 @@ public class EventOccurrenceParser
/// <summary>
/// Builds a <see cref="DateOnly"/> from a month name, a day-of-month string and a year.
/// </summary>
/// <param name="month">Month name; matched case-insensitively against <c>EventOccurrenceGrammar.MonthNames</c>.</param>
/// <param name="dayOfMonth">Day of the month as a decimal string.</param>
/// <param name="year">Calendar year.</param>
/// <returns>The corresponding date.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="month"/> is not a known month name.</exception>
private DateOnly ParseDate(string month, string dayOfMonth, int year)
{
    // Ordinal, case-insensitive comparison avoids culture-dependent lower-casing
    // (e.g. the Turkish dotless 'i') changing which month names match.
    var monthIndex = Array.FindIndex(
        EventOccurrenceGrammar.MonthNames,
        m => string.Equals(m, month, StringComparison.OrdinalIgnoreCase));

    if (monthIndex < 0)
        throw new ArgumentException($"Invalid month: {month}", nameof(month));

    // MonthNames is 0-based; calendar months are 1-based.
    int monthNum = monthIndex + 1;

    var day = int.Parse(dayOfMonth);
    return new DateOnly(year, monthNum, day);
}
@@ -428,11 +415,13 @@ public class EventOccurrenceParser
return string.Empty;
// Remove leading dashes and whitespace
locationText = locationText.TrimStart('', '-', ' ', '\t');
// Note: Input is normalized, so only regular hyphens need to be handled
locationText = locationText.TrimStart('-', ' ', '\t');
// Try to match and remove time patterns at the start
// Pattern 1: Dash, whitespace, time (e.g., " 12:15 p.m. " or " NOON ")
var dashTimePattern = new Regex(@"^[-]\s+(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s+", RegexOptions.IgnoreCase);
// Pattern 1: Dash, whitespace, time (e.g., "- 12:15 p.m. " or "- NOON ")
// Note: Input is normalized, so only regular hyphens need to be handled
var dashTimePattern = new Regex(@"^-\s+(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s+", RegexOptions.IgnoreCase);
locationText = dashTimePattern.Replace(locationText, "").Trim();
// Pattern 2: Time without dash at start (e.g., "12:15 p.m. " or "NOON ")
@@ -440,7 +429,8 @@ public class EventOccurrenceParser
locationText = timePatternAtStart.Replace(locationText, "").Trim();
// Pattern 3: Any remaining dash-time combinations (more flexible)
var remainingDashTime = new Regex(@"^[-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s*", RegexOptions.IgnoreCase);
// Note: Input is normalized, so only regular hyphens need to be handled
var remainingDashTime = new Regex(@"^-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s*", RegexOptions.IgnoreCase);
locationText = remainingDashTime.Replace(locationText, "").Trim();
// Pattern 4: Remove any standalone time at the start (handles cases where dash was already removed)
@@ -521,7 +511,8 @@ public class EventOccurrenceParser
hour = 12;
else
{
var timeMatch = _timeRe.Match(time.ToLower());
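// NOTE(review): _timeRe is constructed without RegexOptions.IgnoreCase and its AM/PM group only matches lowercase 'a'/'p'/'m', so the input must still be lower-cased (or the regex made case-insensitive) for "A.M."/"P.M." to match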
var timeMatch = _timeRe.Match(time);
if (timeMatch.Success)
{
hour = int.Parse(timeMatch.Groups["Hour"].Captures[0].Value);