// Source listing metadata: 544 lines, 20 KiB, C#
using System.Text.RegularExpressions;
|
||
using Core.Entities;
|
||
using Core.Models;
|
||
using FuzzySharp;
|
||
|
||
namespace Core.Parsers;
|
||
|
||
/// <summary>
/// Outcome of parsing an event occurrence file: the occurrences that were recognised,
/// keyed by the event definition they were assigned to, together with the parsing issues
/// recorded along the way.
/// </summary>
public class EventOccurrenceParserResult
{
    /// <summary>Parsed occurrences, keyed by their event definition.</summary>
    public IDictionary<EventDefinition, List<EventOccurrence>> Occurrences { get; set; }
        = new Dictionary<EventDefinition, List<EventOccurrence>>();

    /// <summary>Issues recorded while parsing.</summary>
    public List<ParsingIssue> Issues { get; set; } = new List<ParsingIssue>();
}
|
||
|
||
public class EventOccurrenceParser
|
||
{
|
||
// Text file holding the occurrence listing; Parse() reads it line by line via its FullName.
private readonly FileSystemInfo _txtFile;

// Event definitions that section headers are fuzzy-matched against.
private readonly ICollection<EventDefinition> _events;

// Optional location pattern configuration; when null, locations are kept without pattern matching.
private readonly LocationParsingConfiguration? _locationConfig;
|
||
|
||
/// <summary>
/// Creates a parser for the given occurrence text file.
/// </summary>
/// <param name="txtFile">File whose lines are parsed.</param>
/// <param name="events">Event definitions available for section-header matching.</param>
/// <param name="locationConfig">Optional location pattern configuration; may be omitted.</param>
public EventOccurrenceParser(
    FileSystemInfo txtFile,
    ICollection<EventDefinition> events,
    LocationParsingConfiguration? locationConfig = null)
{
    (_events, _txtFile, _locationConfig) = (events, txtFile, locationConfig);
}
|
||
|
||
// Regex instances are immutable and thread-safe, so the three patterns below are shared
// statically instead of being rebuilt for every parser instance.

/// <summary>
/// Matches an occurrence line of the form "Name Month Day[;] TimeAndLocation".
/// The Name group is anchored to the start of the input and must not begin with '#';
/// Month must be a full English month name.
/// </summary>
private static readonly Regex _re =
    new(
        @"(?<Name>^[^#].*)\s" +
        @"(?<Month>January|February|March|April|May|June|July|August|September|October|November|December)\s" +
        @"(?<DayOfMonth>\d{1,2});?\s" +
        @"(?<TimeAndLocation>.*)"
    );

/// <summary>
/// Matches a single clock time such as "10:30 a.m.", "7 pm" or "1030am".
/// Only lower-case a/p/m are accepted, so callers lower-case the input first.
/// </summary>
private static readonly Regex _timeRe = new(@"(?<Hour>\d{1,2}):?(?<Minute>\d{2})?\s?(?<APM>(?:a|p)\.?m\.?)");

/// <summary>
/// Splits "time [- time] [location]" text into a Time group and an optional Location group.
/// </summary>
/// <remarks>
/// Pattern breakdown:
/// - Start time: NOON, or digits with an optional colon and minutes followed by an AM/PM marker
///   (upper or lower case, dots optional).
/// - Optional range: a hyphen or en dash, surrounded by optional whitespace, followed by NOON
///   or another time. The range is part of the Time group, e.g. "10:30 a.m. - NOON".
/// - Location: whatever follows the time after at least one whitespace character.
/// </remarks>
private static readonly Regex _timeLocationRegex = new(@"(?<Time>(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))(?:\s*[–-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)))?)(?:\s+(?<Location>.+))?");
|
||
|
||
// Fuzzy ratio an event name must exceed to be accepted as the match for a section header.
private const int MinimumFuzzyMatchRatio = 50;

/// <summary>
/// Reads the occurrence file line by line and builds the parse result.
/// </summary>
/// <remarks>
/// Blank lines and comment lines are skipped. A line matching the occurrence pattern
/// (Name Month Day Time/Location) becomes an <see cref="EventOccurrence"/> filed under the
/// event definition resolved from its name or from the most recent section header.
/// Any other line is treated as a section header, a continuation line, or an unmatched line.
/// Lines that cannot be processed are recorded in <see cref="EventOccurrenceParserResult.Issues"/>
/// and parsing continues with the next line.
/// </remarks>
/// <returns>The occurrences grouped by event definition, plus every recorded issue.</returns>
public EventOccurrenceParserResult Parse()
{
    var result = new EventOccurrenceParserResult();
    var occurrences = result.Occurrences;
    var issues = result.Issues;
    EventDefinition? currentEventDefinition = null;

    // The file carries no year, so dates are resolved against the current year.
    // Read it once so every occurrence of one run is dated in the same year.
    var year = DateTime.Now.Year;

    var lineNumber = 0; // 1-based, reported in issues
    foreach (var line in File.ReadLines(_txtFile.FullName))
    {
        lineNumber++;
        var trimmedLine = line.Trim();

        if (string.IsNullOrWhiteSpace(trimmedLine) || EventOccurrenceGrammar.IsCommentLine(trimmedLine))
            continue;

        var match = _re.Match(trimmedLine);
        if (!match.Success)
        {
            currentEventDefinition = HandleNonOccurrenceLine(trimmedLine, lineNumber, currentEventDefinition, issues);
            continue;
        }

        var month = match.Groups["Month"].Value;
        var dayOfMonth = match.Groups["DayOfMonth"].Value;

        // The greedy Name group swallows a trailing weekday ("... Monday,"); strip it.
        var occurrenceName = Regex.Replace(
            match.Groups["Name"].Value,
            @"(?<Weekday>Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s?$", "").Trim();

        var eventDefinition = DetermineEventDefinition(occurrenceName, currentEventDefinition);
        if (eventDefinition is null)
        {
            issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.MissingEventDefinition,
                $"Cannot determine event definition for occurrence: {occurrenceName}"));
            continue;
        }

        // Location problems are recorded as issues by ParseTimeAndLocation itself and do not
        // prevent the occurrence from being created, so the success flag is not needed here.
        var timeAndLocation = SanitizeInput(match.Groups["TimeAndLocation"].Value);
        var (time, location, _) = ParseTimeAndLocation(timeAndLocation, lineNumber, trimmedLine, issues);

        DateOnly startDate;
        try
        {
            startDate = ParseDate(month, dayOfMonth, year);
        }
        catch (Exception ex)
        {
            issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.DateParseFailure,
                $"Failed to parse date: {ex.Message}"));
            continue;
        }

        TimeOnly startTime;
        try
        {
            startTime = ParseStartTime(time);
        }
        catch (Exception ex)
        {
            issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.TimeParseFailure,
                $"Failed to parse time '{time}': {ex.Message}"));
            continue;
        }

        var eventOccurrence = new EventOccurrence
        {
            Name = occurrenceName,
            StartTime = new DateTime(startDate, startTime),
            Time = time,
            Date = $"{month} {dayOfMonth}",
            Location = location
        };

        // Single lookup instead of ContainsKey followed by the indexer.
        if (!occurrences.TryGetValue(eventDefinition, out var bucket))
        {
            bucket = new List<EventOccurrence>();
            occurrences.Add(eventDefinition, bucket);
        }
        bucket.Add(eventOccurrence);
    }

    return result;
}

/// <summary>
/// Handles a line that is not an occurrence: a section header, a General Schedule marker,
/// a legacy "MS"/"HS" header, a continuation line, or an unmatched line.
/// </summary>
/// <returns>
/// The event definition that applies to the following lines: a newly matched one, or
/// <paramref name="current"/> unchanged when the line does not start a new section.
/// </returns>
private EventDefinition? HandleNonOccurrenceLine(
    string trimmedLine,
    int lineNumber,
    EventDefinition? current,
    List<ParsingIssue> issues)
{
    var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(trimmedLine);
    if (sectionHeader.HasValue)
    {
        var (eventNamePart, schoolLevel) = sectionHeader.Value;
        var matched = FindBestEventMatch(eventNamePart, out var bestRatio);
        if (matched is null)
        {
            issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.UnmatchedLine,
                $"Section header '{eventNamePart} – {schoolLevel}' found but no matching event definition (best match ratio: {bestRatio})"));
            return current;
        }
        return matched;
    }

    if (EventOccurrenceGrammar.IsGeneralSchedule(trimmedLine))
        return EventDefinition.GeneralSchedule;

    // Backward compatibility: a line merely containing "MS" or "HS" is treated as a header.
    if (trimmedLine.Contains("MS") || trimmedLine.Contains("HS"))
    {
        var matched = FindBestEventMatch(trimmedLine, out var bestRatio);
        if (matched is null)
        {
            issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.UnmatchedLine,
                $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {bestRatio})"));
            return current;
        }
        return matched;
    }

    // Wrapped or informational lines are skipped without being reported.
    if (IsContinuationLine(trimmedLine))
        return current;

    issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.UnmatchedLine,
        "Line does not match expected format (Name Month Day Time/Location)"));
    return current;
}

/// <summary>
/// Finds the event definition whose name is most similar to <paramref name="candidate"/>.
/// </summary>
/// <param name="candidate">Text to compare against each event name.</param>
/// <param name="bestRatio">Highest ratio seen across all events; 0 when there are none.</param>
/// <returns>
/// The best-scoring event if its ratio exceeds <see cref="MinimumFuzzyMatchRatio"/>, otherwise null.
/// On ties the earliest event in the collection wins.
/// </returns>
private EventDefinition? FindBestEventMatch(string candidate, out int bestRatio)
{
    bestRatio = 0;
    EventDefinition? best = null;
    foreach (var evt in _events)
    {
        var ratio = Fuzz.Ratio(evt.Name, candidate);
        if (ratio > bestRatio)
        {
            bestRatio = ratio;
            best = evt;
        }
    }

    return bestRatio > MinimumFuzzyMatchRatio ? best : null;
}

/// <summary>Builds a <see cref="ParsingIssue"/> for the given line.</summary>
private static ParsingIssue CreateIssue(int lineNumber, string lineContent, ParsingIssueType issueType, string message) =>
    new ParsingIssue
    {
        LineNumber = lineNumber,
        LineContent = lineContent,
        IssueType = issueType,
        Message = message
    };
|
||
|
||
/// <summary>
/// Resolves the event definition an occurrence belongs to.
/// </summary>
/// <remarks>
/// Special occurrence names take precedence over the section the line appears in;
/// otherwise the current section's definition is used.
/// </remarks>
/// <param name="occurrenceName">Occurrence name taken from the line.</param>
/// <param name="currentEventDefinition">Definition of the current section, or null if none.</param>
/// <returns>The resolved definition, or null when neither the name nor the section identifies one.</returns>
private EventDefinition? DetermineEventDefinition(string occurrenceName, EventDefinition? currentEventDefinition)
{
    bool NameHas(string marker) => occurrenceName.Contains(marker, StringComparison.OrdinalIgnoreCase);

    // Name-based special cases win regardless of the current section.
    if (NameHas("Meet the Candidates Session"))
    {
        return EventDefinition.MeetTheCandidates;
    }

    if (NameHas("Chapter Officer Meeting"))
    {
        return EventDefinition.ChapterOfficerMeeting;
    }

    if (NameHas("Voting Delegate Meeting"))
    {
        return EventDefinition.VotingDelegateMeeting;
    }

    // Otherwise fall back to the section context (null when there is none).
    return currentEventDefinition == EventDefinition.GeneralSchedule
        ? EventDefinition.GeneralSchedule
        : currentEventDefinition;
}
|
||
|
||
/// <summary>
/// Tells whether a line is wrapped or informational text that should be skipped
/// rather than reported as unmatched.
/// </summary>
/// <param name="line">Line to inspect; surrounding whitespace is ignored.</param>
/// <returns>
/// True for a fully parenthesised note, for a line that starts with a lower-case letter
/// and one of the known continuation words, or for a line containing "Schedule Posted"
/// or "Note:" (case-insensitive); otherwise false.
/// </returns>
private bool IsContinuationLine(string line)
{
    var text = line.Trim();

    // Parenthetical notes such as "(Semifinalists only)".
    var isParenthetical = text.StartsWith("(", StringComparison.Ordinal)
                          && text.EndsWith(")", StringComparison.Ordinal);
    if (isParenthetical)
    {
        return true;
    }

    // Wrapped text: lower-case first letter plus a common continuation word.
    string[] continuationPrefixes = ["be ", "the ", "and ", "or ", "to ", "a ", "an ", "will ", "may ", "can "];
    var startsLowercase = text.Length > 0 && char.IsLower(text[0]);
    if (startsLowercase
        && Array.Exists(continuationPrefixes, p => text.StartsWith(p, StringComparison.OrdinalIgnoreCase)))
    {
        return true;
    }

    // Informational lines.
    string[] infoMarkers = ["Schedule Posted", "Note:", "*Note:"];
    return Array.Exists(infoMarkers, marker => text.Contains(marker, StringComparison.OrdinalIgnoreCase));
}
|
||
|
||
/// <summary>
/// Replaces en dashes and em dashes with a plain hyphen.
/// </summary>
/// <param name="input">Text to normalise.</param>
/// <returns>The input with every en dash and em dash replaced by '-'.</returns>
private string SanitizeInput(string input) =>
    input
        .Replace("–", "-")
        .Replace("—", "-");
|
||
|
||
/// <summary>
/// Builds a date from an English month name, a day-of-month string and a year.
/// </summary>
/// <param name="month">Full English month name; case is ignored.</param>
/// <param name="dayOfMonth">Day of the month as decimal digits.</param>
/// <param name="year">Year to place the date in.</param>
/// <returns>The corresponding <see cref="DateOnly"/>.</returns>
/// <exception cref="ArgumentException">The month name is not recognised.</exception>
/// <exception cref="FormatException">The day is not a valid integer.</exception>
/// <exception cref="ArgumentOutOfRangeException">The day does not exist in that month and year.</exception>
private DateOnly ParseDate(string month, string dayOfMonth, int year)
{
    // Invariant lower-casing: culture-sensitive casing (e.g. Turkish dotless i)
    // would stop upper-case input such as "APRIL" from matching.
    int monthNum = month.ToLowerInvariant() switch
    {
        "january" => 1,
        "february" => 2,
        "march" => 3,
        "april" => 4,
        "may" => 5,
        "june" => 6,
        "july" => 7,
        "august" => 8,
        "september" => 9,
        "october" => 10,
        "november" => 11,
        "december" => 12,
        _ => throw new ArgumentException($"Invalid month: {month}", nameof(month))
    };

    // The day comes from file content, not user-facing text, so parse it culture-independently.
    var day = int.Parse(
        dayOfMonth,
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture);

    return new DateOnly(year, monthNum, day);
}
|
||
|
||
/// <summary>
/// Splits the text after the date into a time part and a location part, and checks the
/// location against the configured location patterns when there are any.
/// </summary>
/// <param name="timeAndLocation">Text following the date on the occurrence line.</param>
/// <param name="lineNumber">Line number used when recording an issue.</param>
/// <param name="lineContent">Line text used when recording an issue.</param>
/// <param name="issues">Receives a LocationParseFailure issue when patterns are configured but none matches.</param>
/// <returns>
/// time: the matched time text, or the whole trimmed input when no time is recognised.
/// location: the cleaned location text, or empty when there is none.
/// locationParseSuccess: true when there is no location or it matched a configured pattern;
/// false when a location is present but unmatched (including when no patterns are configured).
/// </returns>
private (string time, string location, bool locationParseSuccess) ParseTimeAndLocation(
    string timeAndLocation,
    int lineNumber,
    string lineContent,
    List<ParsingIssue> issues)
{
    var timeLocationMatch = _timeLocationRegex.Match(timeAndLocation);
    if (!timeLocationMatch.Success)
    {
        // No recognisable time: return the whole text as the time so the caller's
        // time parsing can decide what to do with it.
        return (timeAndLocation.Trim(), string.Empty, true);
    }

    var time = timeLocationMatch.Groups["Time"].Value.Trim();

    // Strip leftover time fragments (e.g. "- 12:15 p.m. Exhibit Hall C" -> "Exhibit Hall C").
    var locationGroup = timeLocationMatch.Groups["Location"];
    var locationPart = locationGroup.Success
        ? CleanLocationText(locationGroup.Value.Trim())
        : string.Empty;

    // An occurrence without a location is valid.
    if (string.IsNullOrWhiteSpace(locationPart))
    {
        return (time, string.Empty, true);
    }

    // Without configured patterns the location is kept as-is and nothing is reported.
    var patterns = _locationConfig?.LocationPatterns;
    if (patterns is null || !patterns.Any())
    {
        return (time, locationPart, false);
    }

    // One match attempt is enough: MatchLocationPattern trims its input itself, so
    // repeating the call with the trimmed text cannot change the outcome.
    var matchedLocation = MatchLocationPattern(locationPart, patterns);
    if (!string.IsNullOrEmpty(matchedLocation))
    {
        return (time, matchedLocation, true);
    }

    // Keep the unmatched location so parsing can continue, but report it so that
    // missing patterns can be spotted and added.
    issues.Add(new ParsingIssue
    {
        LineNumber = lineNumber,
        LineContent = lineContent,
        IssueType = ParsingIssueType.LocationParseFailure,
        Message = $"Location '{locationPart}' does not match any configured pattern"
    });

    return (time, locationPart, false);
}
|
||
|
||
// Built once and shared: Regex instances are immutable and thread-safe.

// A clock time (or NOON) at the start followed by whitespace, e.g. "12:15 p.m. " or "NOON ".
private static readonly Regex s_leadingTime =
    new(@"^(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s+", RegexOptions.IgnoreCase);

// A dash at the start followed by a clock time (or NOON), e.g. "- 1:00 p.m. ".
private static readonly Regex s_leadingDashTime =
    new(@"^[–-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)\s*", RegexOptions.IgnoreCase);

// Text that is nothing but a clock time (or NOON).
private static readonly Regex s_timeOnly =
    new(@"^(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)$", RegexOptions.IgnoreCase);

/// <summary>
/// Removes leftover time fragments from the start of a location string,
/// e.g. "– 12:15 p.m. Exhibit Hall C" becomes "Exhibit Hall C".
/// </summary>
/// <param name="locationText">Raw location text.</param>
/// <returns>
/// The trimmed location, or an empty string when the input is blank or only a time remains.
/// </returns>
private string CleanLocationText(string locationText)
{
    if (string.IsNullOrWhiteSpace(locationText))
        return string.Empty;

    // Drop leading dashes, spaces and tabs. After this the text cannot start with a dash,
    // so no "dash + time" check is needed until a leading time has been removed below.
    locationText = locationText.TrimStart('–', '-', ' ', '\t').Trim();

    // "12:15 p.m. Hall" -> "Hall"
    locationText = s_leadingTime.Replace(locationText, "").Trim();

    // Removing a leading time can expose the end of a range: "- 1:00 p.m. Hall" -> "Hall"
    locationText = s_leadingDashTime.Replace(locationText, "").Trim();

    // If only a time is left, there is no location.
    return s_timeOnly.IsMatch(locationText) ? string.Empty : locationText;
}
|
||
|
||
/// <summary>
/// Checks a location against a list of patterns.
/// </summary>
/// <remarks>
/// A pattern without '*' must equal the location (case-insensitive). In a pattern with '*',
/// each '*' stands for any run of characters and the pattern must cover the whole location
/// (case-insensitive). Blank patterns are ignored.
/// </remarks>
/// <param name="locationText">Location to test; surrounding whitespace is ignored.</param>
/// <param name="patterns">Patterns to try, in order.</param>
/// <returns>The trimmed location when some pattern matches, otherwise an empty string.</returns>
private string MatchLocationPattern(string locationText, List<string> patterns)
{
    var candidate = locationText.Trim();
    if (candidate.Length == 0)
    {
        return string.Empty;
    }

    bool IsHit(string rawPattern)
    {
        var wanted = rawPattern.Trim();
        if (wanted.Length == 0)
        {
            return false;
        }

        // Literal pattern such as "Online" or "TBD": whole-string comparison.
        if (wanted.IndexOf('*') < 0)
        {
            return string.Equals(wanted, candidate, StringComparison.OrdinalIgnoreCase);
        }

        // Wildcard pattern such as "Exhibit Hall *": escape everything, then turn each
        // escaped '*' back into a wildcard and anchor both ends.
        var expression = "^" + Regex.Escape(wanted).Replace(@"\*", ".*?") + "$";
        return Regex.IsMatch(candidate, expression, RegexOptions.IgnoreCase);
    }

    return patterns.Exists(IsHit) ? candidate : string.Empty;
}
|
||
|
||
/// <summary>
/// Extracts the start time from a time or time-range string such as
/// "10:30 a.m.", "7 pm", "NOON" or "10:30 a.m. - NOON".
/// </summary>
/// <param name="time">Time text; for a range only the part before the separator is used.</param>
/// <returns>
/// The start time on a 24-hour clock. "TBD" yields midnight as a placeholder so the
/// occurrence can still be created.
/// </returns>
/// <exception cref="FormatException">No clock time could be recognised.</exception>
/// <exception cref="ArgumentOutOfRangeException">The hour or minute is outside the valid range.</exception>
private TimeOnly ParseStartTime(string time)
{
    // "To be determined": use midnight as a placeholder.
    if (string.Equals(time.Trim(), "TBD", StringComparison.OrdinalIgnoreCase))
    {
        return new TimeOnly(0, 0, 0);
    }

    // For a range, only the part before the " - " separator is the start.
    var separatorIndex = time.IndexOf(" - ", StringComparison.Ordinal);
    var startPart = separatorIndex >= 0 ? time[..separatorIndex] : time;

    // NOON may be directly followed by a dash ("NOON-1:00 p.m."), so test only the text
    // before the first dash; otherwise the end of the range would be taken as the start.
    var beforeDash = startPart.Split('-', '–', '—')[0].Trim();
    if (string.Equals(beforeDash, "NOON", StringComparison.OrdinalIgnoreCase))
    {
        return new TimeOnly(12, 0, 0);
    }

    // _timeRe only knows lower-case a/p/m; lower-case culture-independently.
    var timeMatch = _timeRe.Match(startPart.ToLowerInvariant());
    if (!timeMatch.Success)
    {
        throw new FormatException($"Time format not recognized: {startPart}");
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var hour = int.Parse(timeMatch.Groups["Hour"].Value, invariant);
    var minute = timeMatch.Groups["Minute"].Success
        ? int.Parse(timeMatch.Groups["Minute"].Value, invariant)
        : 0;

    // The marker always starts with 'a' or 'p'. Checking the first letter covers every
    // spelling the regex accepts ("pm", "p.m.", "p.m", "pm.").
    var isPm = timeMatch.Groups["APM"].Value[0] == 'p';
    if (isPm && hour < 12)
    {
        hour += 12;
    }
    else if (!isPm && hour == 12)
    {
        // 12:xx a.m. is just after midnight, not midday.
        hour = 0;
    }

    return new TimeOnly(hour, minute, 0);
}
|
||
} |