Refactor event occurrence parsing to use dynamic month parsers and improve input normalization
This commit replaces individual month parsers with a dynamic array of month names, enhancing maintainability. The EventOccurrenceParser has been updated to utilize this new structure, ensuring consistent parsing of month names. Additionally, input normalization has been improved by standardizing hyphen handling and ensuring that all relevant parsing methods utilize the sanitized input. This change streamlines the parsing process and enhances overall robustness.
This commit is contained in:
@@ -10,35 +10,27 @@ namespace Core.Parsers;
|
||||
/// </summary>
|
||||
public static class EventOccurrenceGrammar
|
||||
{
|
||||
// Months - all 12 months supported
|
||||
private static readonly Parser<string> January = Parse.String("January").Text().Token();
|
||||
private static readonly Parser<string> February = Parse.String("February").Text().Token();
|
||||
private static readonly Parser<string> March = Parse.String("March").Text().Token();
|
||||
private static readonly Parser<string> April = Parse.String("April").Text().Token();
|
||||
private static readonly Parser<string> May = Parse.String("May").Text().Token();
|
||||
private static readonly Parser<string> June = Parse.String("June").Text().Token();
|
||||
private static readonly Parser<string> July = Parse.String("July").Text().Token();
|
||||
private static readonly Parser<string> August = Parse.String("August").Text().Token();
|
||||
private static readonly Parser<string> September = Parse.String("September").Text().Token();
|
||||
private static readonly Parser<string> October = Parse.String("October").Text().Token();
|
||||
private static readonly Parser<string> November = Parse.String("November").Text().Token();
|
||||
private static readonly Parser<string> December = Parse.String("December").Text().Token();
|
||||
/// <summary>
|
||||
/// Array of all month names in order (January through December).
|
||||
/// This is the single source of truth for month names used throughout the parser.
|
||||
/// </summary>
|
||||
public static readonly string[] MonthNames = new[]
|
||||
{
|
||||
"January", "February", "March", "April", "May", "June",
|
||||
"July", "August", "September", "October", "November", "December"
|
||||
};
|
||||
|
||||
// Build month parsers dynamically from MonthNames array
|
||||
private static readonly Parser<string>[] MonthParsers = MonthNames
|
||||
.Select(month => Parse.String(month).Text().Token())
|
||||
.ToArray();
|
||||
|
||||
/// <summary>
|
||||
/// Parser for month names (January through December).
|
||||
/// Built dynamically from MonthNames array.
|
||||
/// </summary>
|
||||
public static readonly Parser<string> Month = January
|
||||
.Or(February)
|
||||
.Or(March)
|
||||
.Or(April)
|
||||
.Or(May)
|
||||
.Or(June)
|
||||
.Or(July)
|
||||
.Or(August)
|
||||
.Or(September)
|
||||
.Or(October)
|
||||
.Or(November)
|
||||
.Or(December);
|
||||
public static readonly Parser<string> Month = MonthParsers
|
||||
.Aggregate((current, next) => current.Or(next));
|
||||
|
||||
/// <summary>
|
||||
/// Parser for day of month (1-31, optional semicolon).
|
||||
@@ -66,9 +58,10 @@ public static class EventOccurrenceGrammar
|
||||
select $"{hour}:{(minute.IsDefined ? minute.Get() : "00")} {ampm}";
|
||||
|
||||
/// <summary>
|
||||
/// Parser for hyphen characters (en-dash, hyphen, em-dash).
|
||||
/// Parser for hyphen character.
|
||||
/// Note: Input is assumed to be normalized (en-dash and em-dash converted to regular hyphen) via SanitizeInput.
|
||||
/// </summary>
|
||||
public static readonly Parser<char> Hyphen = Parse.Char('–').Or(Parse.Char('-')).Or(Parse.Char('—'));
|
||||
public static readonly Parser<char> Hyphen = Parse.Char('-');
|
||||
|
||||
/// <summary>
|
||||
/// Parser for time values, including ranges and special values (NOON, TBD).
|
||||
@@ -80,12 +73,13 @@ public static class EventOccurrenceGrammar
|
||||
from dash in Hyphen.Then(_ => Parse.WhiteSpace.Many()).Optional()
|
||||
from end in TimeValue.Or(Noon).Optional()
|
||||
select end.IsDefined
|
||||
? $"{start} – {end.Get()}"
|
||||
? $"{start} - {end.Get()}"
|
||||
: start
|
||||
);
|
||||
|
||||
/// <summary>
|
||||
/// Parser for section headers: EventName [–-—] (MS|HS).
|
||||
/// Parser for section headers: EventName - (MS|HS).
|
||||
/// Note: Input is assumed to be normalized (hyphens normalized) via SanitizeInput.
|
||||
/// </summary>
|
||||
public static readonly Parser<(string EventName, string SchoolLevel)> SectionHeader =
|
||||
from eventName in Parse.AnyChar.Except(Hyphen).Many().Text().Token()
|
||||
@@ -149,5 +143,51 @@ public static class EventOccurrenceGrammar
|
||||
{
|
||||
return line.TrimStart().StartsWith("#", StringComparison.Ordinal);
|
||||
}
|
||||
|
||||
/// <summary>
/// Attempts to split an occurrence line into its name, month, day and trailing time/location text.
/// </summary>
/// <remarks>
/// Strategy: locate month names from <see cref="MonthNames"/> left to right and, for each one, try to
/// parse "day + rest of line" immediately after it. The first month that is followed by a valid day wins.
/// Month names are matched case-insensitively and only as whole words, so a name such as
/// "Marching Band" or "Mayor" does not anchor the parse on "March" or "May".
/// </remarks>
/// <param name="line">The (already normalized) input line. Null or blank input yields null.</param>
/// <returns>
/// The name (text before the month, trimmed), the month in its canonical <see cref="MonthNames"/> spelling,
/// the day of month, and the trimmed remainder of the line; or null when no month/day pair is found.
/// </returns>
public static (string Name, string Month, int Day, string TimeAndLocation)? TryParseOccurrenceLine(string line)
{
    if (string.IsNullOrWhiteSpace(line))
        return null;

    // Everything after the month: a day (surrounding whitespace skipped), then the rest of the line verbatim.
    var dayThenRest =
        from day in DayOfMonth.Token()
        from rest in Parse.AnyChar.Many().Text()
        select (Day: day, Rest: rest.Trim());

    var searchFrom = 0;
    while (searchFrom < line.Length)
    {
        var (monthIndex, foundMonth) = FindNextMonth(line, searchFrom);
        if (monthIndex < 0)
            return null;

        var name = line.Substring(0, monthIndex).Trim();
        var afterMonth = line.Substring(monthIndex + foundMonth.Length);

        try
        {
            // TryParse reports failure through the result instead of throwing ParseException.
            var result = dayThenRest.TryParse(afterMonth);
            if (result.WasSuccessful)
                return (name, foundMonth, result.Value.Day, result.Value.Rest);
        }
        catch (Exception ex) when (ex is FormatException || ex is OverflowException)
        {
            // NOTE(review): DayOfMonth is defined outside this view; presumably it converts digits to int,
            // which could throw on absurd input. Treat that as "this candidate does not parse".
        }

        // This month was not followed by a day; look for a later month on the same line.
        searchFrom = monthIndex + 1;
    }

    return null;

    // Returns the position and canonical name of the earliest whole-word month at or after startIndex,
    // or (-1, "") when there is none.
    static (int Index, string Month) FindNextMonth(string text, int startIndex)
    {
        var bestIndex = -1;
        var bestMonth = string.Empty;

        foreach (var month in MonthNames)
        {
            var index = text.IndexOf(month, startIndex, StringComparison.OrdinalIgnoreCase);
            while (index >= 0 && !IsWholeWord(text, index, month.Length))
                index = text.IndexOf(month, index + 1, StringComparison.OrdinalIgnoreCase);

            if (index >= 0 && (bestIndex < 0 || index < bestIndex))
            {
                bestIndex = index;
                bestMonth = month;
            }
        }

        return (bestIndex, bestMonth);
    }

    // A match counts as a whole word when it is not glued to a letter on either side.
    // A digit directly after the month (e.g. "June5") is still accepted.
    static bool IsWholeWord(string text, int index, int length)
    {
        var end = index + length;
        var startsWord = index == 0 || !char.IsLetter(text[index - 1]);
        var endsWord = end >= text.Length || !char.IsLetter(text[end]);
        return startsWord && endsWord;
    }
}
|
||||
}
|
||||
|
||||
|
||||
@@ -27,25 +27,17 @@ public class EventOccurrenceParser
|
||||
_locationConfig = locationConfig;
|
||||
}
|
||||
|
||||
private Regex _re =
|
||||
new (
|
||||
@"" + //
|
||||
@"(?<Name>^[^#].*)\s" +
|
||||
@"(?<Month>January|February|March|April|May|June|July|August|September|October|November|December)\s" +
|
||||
@"(?<DayOfMonth>\d{1,2});?\s" +
|
||||
@"(?<TimeAndLocation>.*)"
|
||||
);
|
||||
|
||||
private readonly Regex _timeRe = new(@"(?<Hour>\d{1,2}):?(?<Minute>\d{2})?\s?(?<APM>(?:a|p)\.?m\.?)");
|
||||
|
||||
// Regex to match time ranges like "10:30 a.m. - 12:00 p.m." or "10:30 a.m. - NOON"
|
||||
// Matches: time1 (optional dash time2/NOON), then location
|
||||
// The time group captures the full time range (including " - NOON" if present)
|
||||
// Note: Input is normalized via SanitizeInput, so only regular hyphens need to be handled
|
||||
// Pattern breakdown:
|
||||
// - First time: (?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)) - matches NOON or time with AM/PM (more flexible whitespace)
|
||||
// - Optional range: (?:\s*[–-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))) - matches dash followed by NOON or time
|
||||
// - Optional range: (?:\s*-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))) - matches dash followed by NOON or time
|
||||
// - Location: (?:\s+(?<Location>.+))? - optional whitespace followed by location (capture group with explicit name)
|
||||
private readonly Regex _timeLocationRegex = new(@"(?<Time>(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))(?:\s*[–-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)))?)(?:\s+(?<Location>.+))?");
|
||||
private readonly Regex _timeLocationRegex = new(@"(?<Time>(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))(?:\s*-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)))?)(?:\s+(?<Location>.+))?");
|
||||
|
||||
public EventOccurrenceParserResult Parse()
|
||||
{
|
||||
@@ -57,21 +49,25 @@ public class EventOccurrenceParser
|
||||
var lines = File.ReadLines(_txtFile.FullName);
|
||||
foreach (var (line, index) in lines.Select((line, index) => (line, index + 1)))
|
||||
{
|
||||
var trimmedLine = line.Trim();
|
||||
// Normalize input: trim and normalize hyphens (en-dash, em-dash -> regular hyphen)
|
||||
// This allows the grammar parser to assume normalized input
|
||||
var normalizedLine = SanitizeInput(line.Trim());
|
||||
|
||||
// Skip empty lines
|
||||
if (string.IsNullOrWhiteSpace(trimmedLine))
|
||||
if (string.IsNullOrWhiteSpace(normalizedLine))
|
||||
continue;
|
||||
|
||||
// Skip comment lines (starting with "#") - use grammar parser
|
||||
if (EventOccurrenceGrammar.IsCommentLine(trimmedLine))
|
||||
if (EventOccurrenceGrammar.IsCommentLine(normalizedLine))
|
||||
continue;
|
||||
|
||||
var match = _re.Match(trimmedLine);
|
||||
if (!match.Success)
|
||||
// Try to parse occurrence line using grammar parser
|
||||
var occurrenceLine = EventOccurrenceGrammar.TryParseOccurrenceLine(normalizedLine);
|
||||
if (!occurrenceLine.HasValue)
|
||||
{
|
||||
// Not an occurrence line, try other line types
|
||||
// Try to parse section header using grammar parser
|
||||
var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(trimmedLine);
|
||||
var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(normalizedLine);
|
||||
if (sectionHeader.HasValue)
|
||||
{
|
||||
var (eventNamePart, schoolLevel) = sectionHeader.Value;
|
||||
@@ -88,9 +84,9 @@ public class EventOccurrenceParser
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
LineNumber = index,
|
||||
LineContent = trimmedLine,
|
||||
LineContent = normalizedLine,
|
||||
IssueType = ParsingIssueType.UnmatchedLine,
|
||||
Message = $"Section header '{eventNamePart} – {schoolLevel}' found but no matching event definition (best match ratio: {Fuzz.Ratio(eventNamePart, _events.FirstOrDefault()?.Name ?? "")})"
|
||||
Message = $"Section header '{eventNamePart} - {schoolLevel}' found but no matching event definition (best match ratio: {Fuzz.Ratio(eventNamePart, _events.FirstOrDefault()?.Name ?? "")})"
|
||||
});
|
||||
continue;
|
||||
}
|
||||
@@ -99,18 +95,18 @@ public class EventOccurrenceParser
|
||||
}
|
||||
|
||||
// Check for General Schedule/Session using grammar parser
|
||||
if (EventOccurrenceGrammar.IsGeneralSchedule(trimmedLine))
|
||||
if (EventOccurrenceGrammar.IsGeneralSchedule(normalizedLine))
|
||||
{
|
||||
currentEventDefinition = EventDefinition.GeneralSchedule;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Also check for simple "MS" or "HS" in line (backward compatibility)
|
||||
if (trimmedLine.Contains("MS") || trimmedLine.Contains("HS"))
|
||||
if (normalizedLine.Contains("MS") || normalizedLine.Contains("HS"))
|
||||
{
|
||||
var evt =
|
||||
(from e in _events
|
||||
let rat = Fuzz.Ratio(e.Name, trimmedLine)
|
||||
let rat = Fuzz.Ratio(e.Name, normalizedLine)
|
||||
where rat > 50
|
||||
orderby rat descending
|
||||
select e).FirstOrDefault();
|
||||
@@ -119,9 +115,9 @@ public class EventOccurrenceParser
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
LineNumber = index,
|
||||
LineContent = trimmedLine,
|
||||
LineContent = normalizedLine,
|
||||
IssueType = ParsingIssueType.UnmatchedLine,
|
||||
Message = $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {Fuzz.Ratio(trimmedLine, _events.FirstOrDefault()?.Name ?? "")})"
|
||||
Message = $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {Fuzz.Ratio(normalizedLine, _events.FirstOrDefault()?.Name ?? "")})"
|
||||
});
|
||||
continue;
|
||||
}
|
||||
@@ -134,19 +130,19 @@ public class EventOccurrenceParser
|
||||
// - Start with lowercase or special characters (not event names)
|
||||
// - Are parenthetical notes like "(Semifinalists only)"
|
||||
// - Are informational text like "Schedule Posted on..."
|
||||
if (IsContinuationLine(trimmedLine))
|
||||
if (IsContinuationLine(normalizedLine))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// "Voting Delegates" section header is no longer used - occurrences are categorized by name pattern
|
||||
// Track as unmatched line if it's not empty
|
||||
if (!string.IsNullOrWhiteSpace(trimmedLine))
|
||||
if (!string.IsNullOrWhiteSpace(normalizedLine))
|
||||
{
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
LineNumber = index,
|
||||
LineContent = trimmedLine,
|
||||
LineContent = normalizedLine,
|
||||
IssueType = ParsingIssueType.UnmatchedLine,
|
||||
Message = "Line does not match expected format (Name Month Day Time/Location)"
|
||||
});
|
||||
@@ -154,11 +150,9 @@ public class EventOccurrenceParser
|
||||
continue;
|
||||
}
|
||||
|
||||
var occurrenceName = match.Groups["Name"].Captures[0].Value;
|
||||
var month = match.Groups["Month"].Captures[0].Value;
|
||||
var dayOfMonth = match.Groups["DayOfMonth"].Captures[0].Value;
|
||||
var timeAndLocation = match.Groups["TimeAndLocation"].Captures[0].Value;
|
||||
var (occurrenceName, month, dayOfMonthStr, timeAndLocation) = occurrenceLine.Value;
|
||||
|
||||
// Remove weekday suffix from occurrence name if present
|
||||
occurrenceName = Regex.Replace(occurrenceName,
|
||||
@"(?<Weekday>Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s?$", "").Trim();
|
||||
|
||||
@@ -171,30 +165,30 @@ public class EventOccurrenceParser
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
LineNumber = index,
|
||||
LineContent = trimmedLine,
|
||||
LineContent = normalizedLine,
|
||||
IssueType = ParsingIssueType.MissingEventDefinition,
|
||||
Message = $"Cannot determine event definition for occurrence: {occurrenceName}"
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
timeAndLocation = SanitizeInput(timeAndLocation);
|
||||
// timeAndLocation is already normalized (hyphens normalized) since normalizedLine was sanitized
|
||||
|
||||
// Parse time and location using configurable patterns
|
||||
var (time, location, locationParseSuccess) = ParseTimeAndLocation(timeAndLocation, index, trimmedLine, issues);
|
||||
var (time, location, locationParseSuccess) = ParseTimeAndLocation(timeAndLocation, index, normalizedLine, issues);
|
||||
|
||||
// Parse date
|
||||
DateOnly? startDate = null;
|
||||
try
|
||||
{
|
||||
startDate = ParseDate(month, dayOfMonth, DateTime.Now.Year);
|
||||
startDate = ParseDate(month, dayOfMonthStr.ToString(), DateTime.Now.Year);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
LineNumber = index,
|
||||
LineContent = trimmedLine,
|
||||
LineContent = normalizedLine,
|
||||
IssueType = ParsingIssueType.DateParseFailure,
|
||||
Message = $"Failed to parse date: {ex.Message}"
|
||||
});
|
||||
@@ -212,7 +206,7 @@ public class EventOccurrenceParser
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
LineNumber = index,
|
||||
LineContent = trimmedLine,
|
||||
LineContent = normalizedLine,
|
||||
IssueType = ParsingIssueType.TimeParseFailure,
|
||||
Message = $"Failed to parse time '{time}': {ex.Message}"
|
||||
});
|
||||
@@ -229,7 +223,7 @@ public class EventOccurrenceParser
|
||||
Name = occurrenceName,
|
||||
StartTime = t,
|
||||
Time = $"{time}",
|
||||
Date = $"{month} {dayOfMonth}",
|
||||
Date = $"{month} {dayOfMonthStr}",
|
||||
Location = location
|
||||
};
|
||||
|
||||
@@ -311,23 +305,16 @@ public class EventOccurrenceParser
|
||||
|
||||
/// <summary>
/// Builds a calendar date from a month name, a day-of-month string and a year.
/// </summary>
/// <param name="month">
/// Month name; matched against <c>EventOccurrenceGrammar.MonthNames</c> ignoring case.
/// </param>
/// <param name="dayOfMonth">Day of the month written as decimal digits.</param>
/// <param name="year">Calendar year of the resulting date.</param>
/// <returns>The date for the given year, month and day.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="month"/> is not a known month name.</exception>
/// <exception cref="FormatException">Thrown when <paramref name="dayOfMonth"/> is not a valid integer.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when the day does not exist in that month and year.</exception>
private DateOnly ParseDate(string month, string dayOfMonth, int year)
{
    // Ordinal, case-insensitive comparison: the month names are fixed identifiers, so the match must not
    // depend on the current culture's casing rules (as a ToLower() round trip would).
    var monthIndex = Array.FindIndex(
        EventOccurrenceGrammar.MonthNames,
        candidate => string.Equals(candidate, month, StringComparison.OrdinalIgnoreCase));

    if (monthIndex < 0)
        throw new ArgumentException($"Invalid month: {month}", nameof(month));

    // MonthNames is ordered January..December, so the 0-based index maps to month number index + 1.
    var monthNum = monthIndex + 1;
    var day = int.Parse(dayOfMonth, System.Globalization.CultureInfo.InvariantCulture);
    return new DateOnly(year, monthNum, day);
}
|
||||
@@ -428,11 +415,13 @@ public class EventOccurrenceParser
|
||||
return string.Empty;
|
||||
|
||||
// Remove leading dashes and whitespace
|
||||
locationText = locationText.TrimStart('–', '-', ' ', '\t');
|
||||
// Note: Input is normalized, so only regular hyphens need to be handled
|
||||
locationText = locationText.TrimStart('-', ' ', '\t');
|
||||
|
||||
// Try to match and remove time patterns at the start
|
||||
// Pattern 1: Dash, whitespace, time (e.g., "– 12:15 p.m. " or "– NOON ")
|
||||
var dashTimePattern = new Regex(@"^[–-]\s+(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s+", RegexOptions.IgnoreCase);
|
||||
// Pattern 1: Dash, whitespace, time (e.g., "- 12:15 p.m. " or "- NOON ")
|
||||
// Note: Input is normalized, so only regular hyphens need to be handled
|
||||
var dashTimePattern = new Regex(@"^-\s+(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s+", RegexOptions.IgnoreCase);
|
||||
locationText = dashTimePattern.Replace(locationText, "").Trim();
|
||||
|
||||
// Pattern 2: Time without dash at start (e.g., "12:15 p.m. " or "NOON ")
|
||||
@@ -440,7 +429,8 @@ public class EventOccurrenceParser
|
||||
locationText = timePatternAtStart.Replace(locationText, "").Trim();
|
||||
|
||||
// Pattern 3: Any remaining dash-time combinations (more flexible)
|
||||
var remainingDashTime = new Regex(@"^[–-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s*", RegexOptions.IgnoreCase);
|
||||
// Note: Input is normalized, so only regular hyphens need to be handled
|
||||
var remainingDashTime = new Regex(@"^-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s*", RegexOptions.IgnoreCase);
|
||||
locationText = remainingDashTime.Replace(locationText, "").Trim();
|
||||
|
||||
// Pattern 4: Remove any standalone time at the start (handles cases where dash was already removed)
|
||||
@@ -521,7 +511,8 @@ public class EventOccurrenceParser
|
||||
hour = 12;
|
||||
else
|
||||
{
|
||||
var timeMatch = _timeRe.Match(time.ToLower());
|
||||
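// NOTE(review): _timeRe is declared without RegexOptions.IgnoreCase and its AM/PM group lists only lowercase letters, so matching the raw text will miss upper-case markers such as "A.M." - confirm, or add RegexOptions.IgnoreCase to _timeRe.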
|
||||
var timeMatch = _timeRe.Match(time);
|
||||
if (timeMatch.Success)
|
||||
{
|
||||
hour = int.Parse(timeMatch.Groups["Hour"].Captures[0].Value);
|
||||
|
||||
Reference in New Issue
Block a user