7ddc55f672
This commit replaces individual month parsers with a dynamic array of month names, enhancing maintainability. The EventOccurrenceParser has been updated to utilize this new structure, ensuring consistent parsing of month names. Additionally, input normalization has been improved by standardizing hyphen handling and ensuring that all relevant parsing methods utilize the sanitized input. This change streamlines the parsing process and enhances overall robustness.
535 lines
21 KiB
C#
535 lines
21 KiB
C#
using System.Text.RegularExpressions;
|
||
using Core.Entities;
|
||
using Core.Models;
|
||
using FuzzySharp;
|
||
|
||
namespace Core.Parsers;
|
||
|
||
/// <summary>
/// Outcome of parsing an event occurrence file: the occurrences that were recognized,
/// grouped by event definition, together with the issues recorded while parsing.
/// </summary>
public class EventOccurrenceParserResult
{
    /// <summary>Parsed occurrences, keyed by the event definition they were assigned to.</summary>
    public IDictionary<EventDefinition, List<EventOccurrence>> Occurrences { get; set; }
        = new Dictionary<EventDefinition, List<EventOccurrence>>();

    /// <summary>Issues recorded during parsing; empty when none were recorded.</summary>
    public List<ParsingIssue> Issues { get; set; } = new List<ParsingIssue>();
}
|
||
|
||
public class EventOccurrenceParser
|
||
{
|
||
// Schedule text file that Parse() reads line by line.
private readonly FileSystemInfo _txtFile;

// Known event definitions; Parse() fuzzy-matches section headers against their names.
private readonly ICollection<EventDefinition> _events;

// Optional location patterns. When null, locations are kept without any pattern check.
private readonly LocationParsingConfiguration? _locationConfig;

/// <summary>
/// Creates a parser for one schedule file.
/// </summary>
/// <param name="txtFile">The text file to parse; only its full path is used.</param>
/// <param name="events">Event definitions that section headers are matched against.</param>
/// <param name="locationConfig">Optional location patterns used to validate parsed locations.</param>
public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events, LocationParsingConfiguration? locationConfig = null)
{
    // The fields are assigned here only, hence readonly.
    _txtFile = txtFile;
    _events = events;
    _locationConfig = locationConfig;
}
|
||
|
||
// Extracts the parts of a single clock time such as "10:30 a.m.", "9 pm" or "1030 am":
//   Hour   - one or two digits
//   Minute - optional, exactly two digits, with or without a preceding colon
//   APM    - the meridiem marker ("am", "a.m.", "pm", "p.m.", ...)
// The pattern is case-sensitive: only a lowercase marker is recognized.
// Regex instances are immutable and thread-safe, so one shared static instance is enough.
private static readonly Regex _timeRe = new(@"(?<Hour>\d{1,2}):?(?<Minute>\d{2})?\s?(?<APM>(?:a|p)\.?m\.?)");

// Splits the "time and location" tail of an occurrence line into a time (or time range)
// and an optional location, e.g. "10:30 a.m. - 12:00 p.m. Exhibit Hall C" or "10:30 a.m. - NOON".
// Callers pass text that went through SanitizeInput, so only the ASCII hyphen is handled.
// Pattern breakdown:
// - Time start:     NOON, or digits with an optional colon followed by an AM/PM marker
//                   (marker may be upper or lower case, with or without dots)
// - Optional range: a hyphen with optional surrounding whitespace, then NOON or another time;
//                   the range is part of the Time group
// - Location:       optional; whitespace followed by the rest of the text (Location group)
private static readonly Regex _timeLocationRegex = new(@"(?<Time>(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))(?:\s*-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)))?)(?:\s+(?<Location>.+))?");
|
||
|
||
/// <summary>
/// Reads the schedule file line by line and collects the occurrences found for each event definition.
/// </summary>
/// <remarks>
/// Every line is trimmed and dash-normalized first. Empty lines and comment lines are skipped.
/// A line that is not an occurrence either changes the current section (section header,
/// general schedule marker, legacy "MS"/"HS" header), is ignored as a continuation line,
/// or is reported as an unmatched line. Occurrence lines are resolved to an event definition,
/// then their time, location and date are parsed; any failure is recorded as an issue and the
/// line is skipped. The year is not read from the file: <c>DateTime.Now.Year</c> is used.
/// </remarks>
/// <returns>The parsed occurrences together with all issues recorded on the way.</returns>
public EventOccurrenceParserResult Parse()
{
    var result = new EventOccurrenceParserResult();
    var occurrences = result.Occurrences;
    var issues = result.Issues;
    EventDefinition? currentEventDefinition = null;

    // 1-based physical line number; skipped lines are counted too so issues point at the file line.
    var lineNumber = 0;
    foreach (var rawLine in File.ReadLines(_txtFile.FullName))
    {
        lineNumber++;

        // Normalize once so every later step can assume trimmed text with ASCII hyphens.
        var normalizedLine = SanitizeInput(rawLine.Trim());

        if (string.IsNullOrWhiteSpace(normalizedLine))
        {
            continue;
        }

        if (EventOccurrenceGrammar.IsCommentLine(normalizedLine))
        {
            continue;
        }

        var occurrenceLine = EventOccurrenceGrammar.TryParseOccurrenceLine(normalizedLine);
        if (!occurrenceLine.HasValue)
        {
            currentEventDefinition = ResolveSectionContext(normalizedLine, lineNumber, currentEventDefinition, issues);
            continue;
        }

        var (occurrenceName, month, dayOfMonthStr, timeAndLocation) = occurrenceLine.Value;

        // Drop a trailing weekday such as "Friday, " from the occurrence name.
        occurrenceName = Regex.Replace(occurrenceName,
            @"(?<Weekday>Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s?$", "").Trim();

        var eventDefinition = DetermineEventDefinition(occurrenceName, currentEventDefinition);
        if (eventDefinition == null)
        {
            issues.Add(CreateIssue(lineNumber, normalizedLine, ParsingIssueType.MissingEventDefinition,
                $"Cannot determine event definition for occurrence: {occurrenceName}"));
            continue;
        }

        // Runs before date parsing so a location issue is recorded even if the date is bad.
        var (time, location, _) = ParseTimeAndLocation(timeAndLocation, lineNumber, normalizedLine, issues);

        DateOnly startDate;
        try
        {
            startDate = ParseDate(month, dayOfMonthStr.ToString(), DateTime.Now.Year);
        }
        catch (Exception ex)
        {
            issues.Add(CreateIssue(lineNumber, normalizedLine, ParsingIssueType.DateParseFailure,
                $"Failed to parse date: {ex.Message}"));
            continue;
        }

        TimeOnly startTime;
        try
        {
            startTime = ParseStartTime(time);
        }
        catch (Exception ex)
        {
            issues.Add(CreateIssue(lineNumber, normalizedLine, ParsingIssueType.TimeParseFailure,
                $"Failed to parse time '{time}': {ex.Message}"));
            continue;
        }

        var eventOccurrence = new EventOccurrence
        {
            Name = occurrenceName,
            StartTime = new DateTime(startDate, startTime),
            Time = time,
            Date = $"{month} {dayOfMonthStr}",
            Location = location
        };

        // Single lookup instead of ContainsKey + Add + indexer.
        if (!occurrences.TryGetValue(eventDefinition, out var bucket))
        {
            bucket = [];
            occurrences.Add(eventDefinition, bucket);
        }

        bucket.Add(eventOccurrence);
    }

    return result;
}

/// <summary>
/// Handles a line that is not an occurrence and returns the event definition that should be
/// current afterwards.
/// </summary>
/// <param name="normalizedLine">The trimmed, dash-normalized line.</param>
/// <param name="lineNumber">1-based line number used when reporting issues.</param>
/// <param name="current">The event definition that was current before this line.</param>
/// <param name="issues">Receives an issue when the line cannot be matched.</param>
/// <returns>
/// The newly selected event definition, or <paramref name="current"/> when the line does not
/// change the section (failed header match, continuation line, unmatched line).
/// </returns>
private EventDefinition? ResolveSectionContext(string normalizedLine, int lineNumber, EventDefinition? current, List<ParsingIssue> issues)
{
    var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(normalizedLine);
    if (sectionHeader.HasValue)
    {
        var (eventNamePart, schoolLevel) = sectionHeader.Value;
        var (headerMatch, headerRatio) = FindBestEventMatch(eventNamePart);
        if (headerMatch != null)
        {
            return headerMatch;
        }

        issues.Add(CreateIssue(lineNumber, normalizedLine, ParsingIssueType.UnmatchedLine,
            $"Section header '{eventNamePart} - {schoolLevel}' found but no matching event definition (best match ratio: {headerRatio})"));
        return current;
    }

    if (EventOccurrenceGrammar.IsGeneralSchedule(normalizedLine))
    {
        return EventDefinition.GeneralSchedule;
    }

    // Backward compatibility: a line merely containing "MS" or "HS" is treated as a header
    // and matched against the whole line.
    if (normalizedLine.Contains("MS") || normalizedLine.Contains("HS"))
    {
        var (legacyMatch, legacyRatio) = FindBestEventMatch(normalizedLine);
        if (legacyMatch != null)
        {
            return legacyMatch;
        }

        issues.Add(CreateIssue(lineNumber, normalizedLine, ParsingIssueType.UnmatchedLine,
            $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {legacyRatio})"));
        return current;
    }

    // Wrapped text, parenthetical notes and informational lines are ignored silently.
    if (IsContinuationLine(normalizedLine))
    {
        return current;
    }

    issues.Add(CreateIssue(lineNumber, normalizedLine, ParsingIssueType.UnmatchedLine,
        "Line does not match expected format (Name Month Day Time/Location)"));
    return current;
}

/// <summary>
/// Finds the event definition whose name is most similar to <paramref name="candidateName"/>.
/// </summary>
/// <param name="candidateName">Header text to compare against each event name.</param>
/// <returns>
/// The first event with the highest <c>Fuzz.Ratio</c> when that ratio is above 50, otherwise
/// null; plus the highest ratio seen (0 when there are no events) for diagnostics.
/// </returns>
private (EventDefinition? Match, int BestRatio) FindBestEventMatch(string candidateName)
{
    EventDefinition? best = null;
    var bestRatio = 0;

    foreach (var candidate in _events)
    {
        var ratio = Fuzz.Ratio(candidate.Name, candidateName);

        // Strictly greater keeps the earliest event on ties.
        if (ratio > bestRatio)
        {
            bestRatio = ratio;
            best = candidate;
        }
    }

    return (bestRatio > 50 ? best : null, bestRatio);
}

/// <summary>Builds a parsing issue for the given line.</summary>
private static ParsingIssue CreateIssue(int lineNumber, string lineContent, ParsingIssueType issueType, string message) =>
    new ParsingIssue
    {
        LineNumber = lineNumber,
        LineContent = lineContent,
        IssueType = issueType,
        Message = message
    };
|
||
|
||
/// <summary>
/// Picks the event definition an occurrence belongs to.
/// </summary>
/// <remarks>
/// Well-known phrases in the occurrence name take precedence over the section the line appears
/// in. Without such a phrase the current section's definition is used.
/// </remarks>
/// <param name="occurrenceName">Name of the occurrence, checked case-insensitively for known phrases.</param>
/// <param name="currentEventDefinition">Definition selected by the most recent section header, if any.</param>
/// <returns>The resolved definition, or null when no phrase matches and there is no current section.</returns>
private EventDefinition? DetermineEventDefinition(string occurrenceName, EventDefinition? currentEventDefinition)
{
    bool NameMentions(string phrase) =>
        occurrenceName.Contains(phrase, StringComparison.OrdinalIgnoreCase);

    // Arms are evaluated top to bottom, so the phrase checks win over the section context.
    return currentEventDefinition switch
    {
        _ when NameMentions("Meet the Candidates Session") => EventDefinition.MeetTheCandidates,
        _ when NameMentions("Chapter Officer Meeting") => EventDefinition.ChapterOfficerMeeting,
        _ when NameMentions("Voting Delegate Meeting") => EventDefinition.VotingDelegateMeeting,
        _ when currentEventDefinition == EventDefinition.GeneralSchedule => EventDefinition.GeneralSchedule,
        // Either the definition from the section header, or null when there is none.
        _ => currentEventDefinition,
    };
}
|
||
|
||
// Leading words that mark a lowercase line as wrapped text. Kept static so the array is
// not re-allocated on every call.
private static readonly string[] _continuationPrefixes =
    ["be ", "the ", "and ", "or ", "to ", "a ", "an ", "will ", "may ", "can "];

/// <summary>
/// Tells whether a line is wrapped or informational text that should be skipped.
/// </summary>
/// <param name="line">The line to inspect; surrounding whitespace is ignored.</param>
/// <returns>
/// True for a parenthetical note such as "(Semifinalists only)", for a line that starts with a
/// lowercase letter and one of the known continuation words, and for a line containing
/// "Schedule Posted" or "Note:" (case-insensitive); false otherwise.
/// </returns>
private bool IsContinuationLine(string line)
{
    var trimmed = line.Trim();

    // Parenthetical note.
    if (trimmed.StartsWith('(') && trimmed.EndsWith(')'))
    {
        return true;
    }

    // Wrapped text: must start with a lowercase letter AND a known continuation word.
    if (trimmed.Length > 0
        && char.IsLower(trimmed[0])
        && Array.Exists(_continuationPrefixes, prefix => trimmed.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)))
    {
        return true;
    }

    // Informational text. "Note:" also covers "*Note:", so no separate check is needed.
    return trimmed.Contains("Schedule Posted", StringComparison.OrdinalIgnoreCase)
        || trimmed.Contains("Note:", StringComparison.OrdinalIgnoreCase);
}
|
||
|
||
/// <summary>
/// Normalizes dash characters so later parsing only has to handle the ASCII hyphen:
/// every en dash and em dash is replaced with "-".
/// </summary>
/// <param name="input">The text to normalize.</param>
/// <returns>The text with en and em dashes replaced; everything else is left unchanged.</returns>
private string SanitizeInput(string input) =>
    input
        .Replace("–", "-")
        .Replace("—", "-");
|
||
|
||
/// <summary>
/// Builds a date from a month name, a day-of-month string and a year.
/// </summary>
/// <param name="month">Month name; matched case-insensitively against <c>EventOccurrenceGrammar.MonthNames</c>.</param>
/// <param name="dayOfMonth">Day of the month as decimal text.</param>
/// <param name="year">The year to use.</param>
/// <returns>The resulting date.</returns>
/// <exception cref="ArgumentException">The month name is not in the month list.</exception>
/// <exception cref="FormatException">The day is not a valid integer.</exception>
/// <exception cref="ArgumentOutOfRangeException">The day does not exist in that month and year.</exception>
private DateOnly ParseDate(string month, string dayOfMonth, int year)
{
    // Ordinal, case-insensitive comparison: lower-casing both sides is culture-sensitive
    // (e.g. "I" lower-cases to a dotless i under tr-TR) and allocates a string per entry.
    var monthIndex = Array.FindIndex(EventOccurrenceGrammar.MonthNames,
        name => string.Equals(name, month, StringComparison.OrdinalIgnoreCase));

    if (monthIndex < 0)
    {
        throw new ArgumentException($"Invalid month: {month}", nameof(month));
    }

    // The array index is 0-based, the month number 1-based.
    // NOTE(review): assumes MonthNames lists the months in calendar order - confirm in the grammar.
    var monthNumber = monthIndex + 1;

    // The day comes from the schedule file, not from user-localized input.
    var day = int.Parse(dayOfMonth, System.Globalization.CultureInfo.InvariantCulture);

    return new DateOnly(year, monthNumber, day);
}
|
||
|
||
/// <summary>
/// Splits the tail of an occurrence line into its time text and its location, and checks the
/// location against the configured location patterns when there are any.
/// </summary>
/// <param name="timeAndLocation">Text following the date, e.g. "10:30 a.m. - NOON Exhibit Hall C".</param>
/// <param name="lineNumber">Line number used when reporting an issue.</param>
/// <param name="lineContent">Full line content used when reporting an issue.</param>
/// <param name="issues">Receives a location issue when patterns are configured but none matches.</param>
/// <returns>
/// <c>time</c>: the matched time or time range, or the whole trimmed input when no time is recognized.
/// <c>location</c>: the location text, or empty when there is none.
/// <c>locationParseSuccess</c>: true when there is no location or it matched a configured pattern;
/// false when a location is present but was not matched (including when no patterns are configured).
/// </returns>
private (string time, string location, bool locationParseSuccess) ParseTimeAndLocation(
    string timeAndLocation,
    int lineNumber,
    string lineContent,
    List<ParsingIssue> issues)
{
    var match = _timeLocationRegex.Match(timeAndLocation);
    if (!match.Success)
    {
        // No recognizable time: hand the whole text back as the time and let the caller's
        // time parsing decide what to do with it.
        return (timeAndLocation.Trim(), string.Empty, true);
    }

    var time = match.Groups["Time"].Value.Trim();

    var locationGroup = match.Groups["Location"];
    var rawLocation = locationGroup.Success ? locationGroup.Value.Trim() : string.Empty;

    // Remove time fragments that ended up in the location text
    // (e.g. "- 12:15 p.m. Exhibit Hall C" becomes "Exhibit Hall C").
    var locationPart = string.IsNullOrWhiteSpace(rawLocation)
        ? string.Empty
        : CleanLocationText(rawLocation);

    if (string.IsNullOrWhiteSpace(locationPart))
    {
        // A missing location is valid; some occurrences have none.
        return (time, string.Empty, true);
    }

    if (_locationConfig?.LocationPatterns is { Count: > 0 } patterns)
    {
        var matchedLocation = MatchLocationPattern(locationPart, patterns);
        if (!string.IsNullOrEmpty(matchedLocation))
        {
            return (time, matchedLocation, true);
        }

        // Patterns exist but none matched: keep the location and report it, so missing
        // patterns can be spotted and added.
        issues.Add(new ParsingIssue
        {
            LineNumber = lineNumber,
            LineContent = lineContent,
            IssueType = ParsingIssueType.LocationParseFailure,
            Message = $"Location '{locationPart}' does not match any configured pattern"
        });
    }

    // Unvalidated location: used as-is so parsing can continue.
    return (time, locationPart, false);
}
|
||
|
||
// A time followed by whitespace at the start of the text, e.g. "12:15 p.m. " or "NOON ".
private static readonly Regex _leadingTimeRe =
    new(@"^(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s+", RegexOptions.IgnoreCase);

// A hyphen and a time at the start of the text, e.g. "- 1:00 p.m. " or "-NOON".
private static readonly Regex _leadingDashTimeRe =
    new(@"^-\s*(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s*", RegexOptions.IgnoreCase);

// Text that consists of nothing but a time.
private static readonly Regex _bareTimeRe =
    new(@"^(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)$", RegexOptions.IgnoreCase);

/// <summary>
/// Removes time fragments from the start of a location text,
/// e.g. "- 12:15 p.m. Exhibit Hall C" becomes "Exhibit Hall C".
/// </summary>
/// <param name="locationText">Location text, expected to contain ASCII hyphens only.</param>
/// <returns>
/// The trimmed location without a leading time or hyphen-time; empty when the input is blank
/// or nothing but a time remains.
/// </returns>
private string CleanLocationText(string locationText)
{
    if (string.IsNullOrWhiteSpace(locationText))
    {
        return string.Empty;
    }

    // Drop leading hyphens, spaces and tabs, then any other surrounding whitespace.
    var text = locationText.TrimStart('-', ' ', '\t').Trim();

    // A time directly at the start ("12:15 p.m. Hall").
    text = _leadingTimeRe.Replace(text, "").Trim();

    // What is left of a range once its first time is gone ("- 1:00 p.m. Hall").
    text = _leadingDashTimeRe.Replace(text, "").Trim();

    // Only a time remains, so there is no location.
    if (_bareTimeRe.IsMatch(text))
    {
        return string.Empty;
    }

    return text;
}
|
||
|
||
/// <summary>
/// Checks a location text against a list of patterns.
/// </summary>
/// <remarks>
/// A pattern without "*" must equal the location; a pattern with "*" is matched with each "*"
/// standing for any run of characters (e.g. "Exhibit Hall *"). Both comparisons ignore case.
/// Blank patterns are skipped.
/// </remarks>
/// <param name="locationText">The location to check; surrounding whitespace is ignored.</param>
/// <param name="patterns">Patterns to try, in order.</param>
/// <returns>The trimmed location when some pattern matches it; otherwise an empty string.</returns>
private string MatchLocationPattern(string locationText, List<string> patterns)
{
    var candidate = locationText.Trim();
    if (string.IsNullOrWhiteSpace(candidate))
    {
        return string.Empty;
    }

    bool Accepts(string pattern)
    {
        if (!pattern.Contains('*'))
        {
            // Literal pattern such as "Online", "Virtual" or "TBD".
            return string.Equals(pattern, candidate, StringComparison.OrdinalIgnoreCase);
        }

        // Escape everything, then turn each escaped "*" back into a wildcard.
        var wildcardExpression = "^" + Regex.Escape(pattern).Replace(@"\*", ".*?") + "$";
        return Regex.IsMatch(candidate, wildcardExpression, RegexOptions.IgnoreCase);
    }

    // Lazily evaluated: stops at the first pattern that accepts the location.
    var accepted = patterns
        .Select(pattern => pattern.Trim())
        .Where(pattern => !string.IsNullOrWhiteSpace(pattern))
        .Any(Accepts);

    return accepted ? candidate : string.Empty;
}
|
||
|
||
/// <summary>
/// Converts a time text, or the start of a time range, into a <see cref="TimeOnly"/>.
/// </summary>
/// <remarks>
/// "TBD" yields midnight as a placeholder so the occurrence can still be created.
/// For a range such as "10:30 a.m. - NOON" or "10:30 a.m.-NOON" only the part before the
/// hyphen is used. "NOON" yields 12:00. The AM/PM marker is accepted in any letter case and
/// with or without dots; 12 a.m. is midnight and 12 p.m. is noon.
/// </remarks>
/// <param name="time">The time text to parse.</param>
/// <returns>The start time.</returns>
/// <exception cref="FormatException">No time can be recognized in the text.</exception>
/// <exception cref="ArgumentOutOfRangeException">The hour or minute is outside the valid range.</exception>
private TimeOnly ParseStartTime(string time)
{
    var text = time.Trim();

    if (string.Equals(text, "TBD", StringComparison.OrdinalIgnoreCase))
    {
        return new TimeOnly(0, 0, 0);
    }

    // Keep only the start of a range. The hyphen may come without surrounding spaces, so
    // splitting on " - " alone would let a later time in the range be picked up instead.
    var hyphenAt = text.IndexOf('-');
    if (hyphenAt > 0)
    {
        text = text[..hyphenAt].Trim();
    }

    if (string.Equals(text, "NOON", StringComparison.OrdinalIgnoreCase))
    {
        return new TimeOnly(12, 0, 0);
    }

    // _timeRe only recognizes a lowercase marker, while the time/location split also lets
    // "A.M." or "PM" through; lower-case the text so both forms parse.
    var match = _timeRe.Match(text.ToLowerInvariant());
    if (!match.Success)
    {
        throw new FormatException($"Time format not recognized: {text}");
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var hour = int.Parse(match.Groups["Hour"].Value, invariant);
    var minute = match.Groups["Minute"].Success
        ? int.Parse(match.Groups["Minute"].Value, invariant)
        : 0;

    // The marker is "a" or "p" followed by optional dots and "m"; the first letter decides,
    // so "pm", "p.m", "pm." and "p.m." all count as PM.
    var isPm = match.Groups["APM"].Value.StartsWith('p');

    if (hour == 12)
    {
        if (!isPm)
        {
            hour = 0; // 12 a.m. is midnight
        }
    }
    else if (isPm && hour < 12)
    {
        hour += 12;
    }

    // TimeOnly rejects out-of-range values (e.g. hour 25) by throwing.
    return new TimeOnly(hour, minute, 0);
}
|
||
} |