using System.Text.RegularExpressions;
using Core.Entities;
using Core.Models;
using FuzzySharp;
namespace Core.Parsers;
/// <summary>
/// Result of parsing an event occurrence file, containing both the parsed occurrences and any parsing issues.
/// </summary>
public class EventOccurrenceParserResult
{
    /// <summary>Parsed occurrences, grouped by the event definition they were filed under.</summary>
    public IDictionary<EventDefinition, List<EventOccurrence>> Occurrences { get; set; } =
        new Dictionary<EventDefinition, List<EventOccurrence>>();

    /// <summary>Problems recorded while parsing.</summary>
    public List<ParsingIssue> Issues { get; set; } = new();
}
public class EventOccurrenceParser
{
// Schedule text file to parse; Parse() reads it through its FullName.
private readonly FileSystemInfo _txtFile;

// Known event definitions that section headers are fuzzy-matched against by name.
private readonly ICollection<EventDefinition> _events;

// Optional location patterns; when null, locations are kept as-is without pattern validation.
private readonly LocationParsingConfiguration? _locationConfig;

/// <summary>
/// Creates a parser for one schedule file.
/// </summary>
/// <param name="txtFile">The text file whose lines will be parsed.</param>
/// <param name="events">Event definitions that section headers can resolve to.</param>
/// <param name="locationConfig">Optional configuration holding the accepted location patterns.</param>
public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events, LocationParsingConfiguration? locationConfig = null)
{
    _txtFile = txtFile;
    _events = events;
    _locationConfig = locationConfig;
}
// Matches one occurrence line: "<Name> <Month> <DayOfMonth> <TimeAndLocation>".
// Lines starting with '#' never match. The group names are looked up by Parse().
private readonly Regex _re =
    new(
        @"(?<Name>^[^#].*)\s" +
        @"(?<Month>January|February|March|April|May|June|July|August|September|October|November|December)\s" +
        @"(?<DayOfMonth>\d{1,2});?\s" +
        @"(?<TimeAndLocation>.*)"
    );

// Matches a single lower-cased clock time such as "10:30 a.m." or "9 pm".
// Groups: Hour, optional Minute, and APM (the a.m./p.m. marker). Used by ParseStartTime().
private readonly Regex _timeRe = new(@"(?<Hour>\d{1,2}):?(?<Minute>\d{2})?\s?(?<APM>(?:a|p)\.?m\.?)");

// Regex to match time ranges like "10:30 a.m. - 12:00 p.m." or "10:30 a.m. - NOON", followed by a location.
// The Time group captures the full time range (including " - NOON" if present).
// Pattern breakdown:
// - First time: (?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)) - NOON or a time with AM/PM (flexible whitespace)
// - Optional range: (?:\s*[–-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))) - a dash followed by NOON or a time
// - Location: (?:\s+(?<Location>.+))? - optional whitespace followed by the location text
private readonly Regex _timeLocationRegex = new(@"(?<Time>(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?))(?:\s*[–-]\s*(?:NOON|\d{1,2}:?\d{0,2}\s*(?:[AaPp]\.?[Mm]\.?)))?)(?:\s+(?<Location>.+))?");
// A header must score strictly above this FuzzySharp ratio to be bound to an event definition.
private const int FuzzyMatchThreshold = 50;

// Matches a weekday plus comma at the very end of the name part (e.g. "Opening Session Friday,").
private static readonly Regex _trailingWeekdayRe =
    new(@"(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s?$");

/// <summary>
/// Reads the schedule file line by line and turns every occurrence line into an
/// <see cref="EventOccurrence"/>, grouped under the event definition that applies to it.
/// </summary>
/// <remarks>
/// Blank lines and comment lines are skipped. A line that does not match the occurrence pattern is
/// treated as a section header, a general-schedule marker or a continuation line; anything else is
/// recorded as an <see cref="ParsingIssueType.UnmatchedLine"/> issue. Lines whose event definition,
/// date or time cannot be resolved are recorded as issues and produce no occurrence.
/// I/O exceptions raised while reading the file are not caught here.
/// </remarks>
/// <returns>The parsed occurrences together with every issue recorded along the way.</returns>
public EventOccurrenceParserResult Parse()
{
    var result = new EventOccurrenceParserResult();
    var occurrences = result.Occurrences;
    var issues = result.Issues;
    EventDefinition? currentEventDefinition = null;

    // The occurrence pattern captures no year, so dates are placed in the current calendar year.
    // Read it once so every line of the file agrees.
    var year = DateTime.Now.Year;

    var lineNumber = 0;
    foreach (var rawLine in File.ReadLines(_txtFile.FullName))
    {
        lineNumber++; // 1-based, as reported in issues
        var trimmedLine = rawLine.Trim();

        if (trimmedLine.Length == 0 || EventOccurrenceGrammar.IsCommentLine(trimmedLine))
            continue;

        var match = _re.Match(trimmedLine);
        if (!match.Success)
        {
            currentEventDefinition = HandleNonOccurrenceLine(trimmedLine, lineNumber, currentEventDefinition, issues);
            continue;
        }

        var occurrenceName = _trailingWeekdayRe.Replace(match.Groups["Name"].Value, "").Trim();
        var month = match.Groups["Month"].Value;
        var dayOfMonth = match.Groups["DayOfMonth"].Value;

        // Name patterns take precedence over the enclosing section.
        var eventDefinition = DetermineEventDefinition(occurrenceName, currentEventDefinition);
        if (eventDefinition == null)
        {
            issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.MissingEventDefinition,
                $"Cannot determine event definition for occurrence: {occurrenceName}"));
            continue;
        }

        var timeAndLocation = SanitizeInput(match.Groups["TimeAndLocation"].Value);

        // A location that matches no configured pattern is reported inside ParseTimeAndLocation;
        // the occurrence is still created, so the success flag is not needed here.
        var (time, location, _) = ParseTimeAndLocation(timeAndLocation, lineNumber, trimmedLine, issues);

        DateOnly startDate;
        try
        {
            startDate = ParseDate(month, dayOfMonth, year);
        }
        catch (Exception ex)
        {
            issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.DateParseFailure,
                $"Failed to parse date: {ex.Message}"));
            continue;
        }

        TimeOnly startTime;
        try
        {
            startTime = ParseStartTime(time);
        }
        catch (Exception ex)
        {
            issues.Add(CreateIssue(lineNumber, trimmedLine, ParsingIssueType.TimeParseFailure,
                $"Failed to parse time '{time}': {ex.Message}"));
            continue;
        }

        var eventOccurrence = new EventOccurrence
        {
            Name = occurrenceName,
            StartTime = new DateTime(startDate, startTime),
            Time = time,
            Date = $"{month} {dayOfMonth}",
            Location = location
        };

        if (!occurrences.TryGetValue(eventDefinition, out var bucket))
        {
            bucket = [];
            occurrences.Add(eventDefinition, bucket);
        }
        bucket.Add(eventOccurrence);
    }

    return result;
}

// Classifies a line that is not an occurrence and returns the event definition that applies to the
// lines after it: a new one for a recognised header, otherwise the unchanged current one.
private EventDefinition? HandleNonOccurrenceLine(
    string line,
    int lineNumber,
    EventDefinition? currentEventDefinition,
    List<ParsingIssue> issues)
{
    var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(line);
    if (sectionHeader.HasValue)
    {
        var (eventNamePart, schoolLevel) = sectionHeader.Value;
        var (headerEvent, headerRatio) = FindBestEventMatch(eventNamePart);
        if (headerEvent != null)
            return headerEvent;

        issues.Add(CreateIssue(lineNumber, line, ParsingIssueType.UnmatchedLine,
            $"Section header '{eventNamePart} – {schoolLevel}' found but no matching event definition (best match ratio: {headerRatio})"));
        return currentEventDefinition;
    }

    if (EventOccurrenceGrammar.IsGeneralSchedule(line))
        return EventDefinition.GeneralSchedule;

    // Backward compatibility: a bare "MS"/"HS" anywhere in the line marks an event header.
    if (line.Contains("MS") || line.Contains("HS"))
    {
        var (levelEvent, levelRatio) = FindBestEventMatch(line);
        if (levelEvent != null)
            return levelEvent;

        issues.Add(CreateIssue(lineNumber, line, ParsingIssueType.UnmatchedLine,
            $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {levelRatio})"));
        return currentEventDefinition;
    }

    // Wrapped text, parenthetical notes and informational lines are dropped silently.
    if (IsContinuationLine(line))
        return currentEventDefinition;

    issues.Add(CreateIssue(lineNumber, line, ParsingIssueType.UnmatchedLine,
        "Line does not match expected format (Name Month Day Time/Location)"));
    return currentEventDefinition;
}

// Returns the event whose name scores highest against the text, or null when even the best score
// does not exceed the threshold. The best score is always returned so it can be reported.
private (EventDefinition? Match, int BestRatio) FindBestEventMatch(string text)
{
    EventDefinition? best = null;
    var bestRatio = 0;
    foreach (var candidate in _events)
    {
        var ratio = Fuzz.Ratio(candidate.Name, text);
        if (ratio > bestRatio) // strict: the first of equally scored candidates wins
        {
            bestRatio = ratio;
            best = candidate;
        }
    }

    return bestRatio > FuzzyMatchThreshold ? (best, bestRatio) : (null, bestRatio);
}

// Builds a parsing issue for the given line.
private static ParsingIssue CreateIssue(int lineNumber, string lineContent, ParsingIssueType issueType, string message) =>
    new()
    {
        LineNumber = lineNumber,
        LineContent = lineContent,
        IssueType = issueType,
        Message = message
    };
/// <summary>
/// Picks the event definition for an occurrence: a special name pattern wins, otherwise the
/// definition of the section the occurrence appears in is used.
/// </summary>
/// <param name="occurrenceName">Name of the occurrence, searched case-insensitively for known phrases.</param>
/// <param name="currentEventDefinition">Definition of the enclosing section, or null when there is none.</param>
/// <returns>The resolved definition, or null when no phrase matches and there is no enclosing section.</returns>
private EventDefinition? DetermineEventDefinition(string occurrenceName, EventDefinition? currentEventDefinition)
{
    bool NameMentions(string phrase) =>
        occurrenceName.Contains(phrase, StringComparison.OrdinalIgnoreCase);

    // Special names are recognised regardless of the section they appear in.
    if (NameMentions("Meet the Candidates Session"))
    {
        return EventDefinition.MeetTheCandidates;
    }

    if (NameMentions("Chapter Officer Meeting"))
    {
        return EventDefinition.ChapterOfficerMeeting;
    }

    if (NameMentions("Voting Delegate Meeting"))
    {
        return EventDefinition.VotingDelegateMeeting;
    }

    // Inside a general-schedule section the shared GeneralSchedule definition is returned;
    // otherwise the section's own definition (possibly null) is passed through.
    return currentEventDefinition == EventDefinition.GeneralSchedule
        ? EventDefinition.GeneralSchedule
        : currentEventDefinition;
}
/// <summary>
/// Tells whether a non-occurrence line is wrapped or informational text that can be ignored.
/// </summary>
/// <param name="line">Line text; surrounding whitespace is ignored.</param>
/// <returns>
/// true for fully parenthesised notes, for lines that start with a lowercase letter and one of the
/// known continuation words, and for lines containing "Schedule Posted" or "Note:"; otherwise false.
/// </returns>
private bool IsContinuationLine(string line)
{
    var text = line.Trim();

    // A note wrapped in parentheses, e.g. "(Semifinalists only)".
    var isParenthetical =
        text.StartsWith("(", StringComparison.Ordinal) && text.EndsWith(")", StringComparison.Ordinal);
    if (isParenthetical)
    {
        return true;
    }

    // Wrapped sentences continue in lowercase with a common connecting word.
    if (text.Length > 0 && char.IsLower(text[0]))
    {
        string[] leadInWords = ["be ", "the ", "and ", "or ", "to ", "a ", "an ", "will ", "may ", "can "];
        if (Array.Exists(leadInWords, word => text.StartsWith(word, StringComparison.OrdinalIgnoreCase)))
        {
            return true;
        }
    }

    // Informational text that carries no date or time.
    string[] infoMarkers = ["Schedule Posted", "Note:", "*Note:"];
    return Array.Exists(infoMarkers, marker => text.Contains(marker, StringComparison.OrdinalIgnoreCase));
}
/// <summary>
/// Normalises en dashes and em dashes to a plain hyphen so later matching only has to deal with "-".
/// </summary>
/// <param name="input">Text to normalise.</param>
/// <returns>The text with every "–" and "—" replaced by "-".</returns>
private string SanitizeInput(string input) =>
    input
        .Replace("–", "-")
        .Replace("—", "-");
/// <summary>
/// Builds a date from an English month name, a day-of-month string and a year.
/// </summary>
/// <param name="month">Full English month name; matched case-insensitively.</param>
/// <param name="dayOfMonth">Day of the month as decimal digits.</param>
/// <param name="year">Calendar year.</param>
/// <returns>The corresponding date.</returns>
/// <exception cref="ArgumentException">The month name is not recognised.</exception>
/// <exception cref="FormatException">The day is not a valid integer.</exception>
/// <exception cref="ArgumentOutOfRangeException">The day does not exist in that month and year.</exception>
private DateOnly ParseDate(string month, string dayOfMonth, int year)
{
    // Invariant lower-casing: the lookup must not depend on the machine's culture
    // (e.g. Turkish casing rules would turn "APRIL" into "aprıl").
    int monthNum = month.ToLowerInvariant() switch
    {
        "january" => 1,
        "february" => 2,
        "march" => 3,
        "april" => 4,
        "may" => 5,
        "june" => 6,
        "july" => 7,
        "august" => 8,
        "september" => 9,
        "october" => 10,
        "november" => 11,
        "december" => 12,
        _ => throw new ArgumentException($"Invalid month: {month}", nameof(month))
    };

    // The day comes from file content, not user-localised input, so parse it culture-independently.
    var day = int.Parse(dayOfMonth, System.Globalization.CultureInfo.InvariantCulture);
    return new DateOnly(year, monthNum, day);
}
/// <summary>
/// Splits the text that follows the date into a time (or time range) and an optional location,
/// and checks the location against the configured location patterns.
/// </summary>
/// <param name="timeAndLocation">Text after the date, e.g. "10:30 a.m. - NOON Exhibit Hall C".</param>
/// <param name="lineNumber">Line number stored on any issue that is recorded.</param>
/// <param name="lineContent">Full line text stored on any issue that is recorded.</param>
/// <param name="issues">Receives a <see cref="ParsingIssueType.LocationParseFailure"/> issue when
/// patterns are configured and the location matches none of them.</param>
/// <returns>
/// The time text, the location text (empty when none was found) and a flag that is false only when
/// a non-empty location could not be validated: either no pattern matched or no patterns are configured.
/// When no time is recognised, the whole trimmed input is returned as the time.
/// </returns>
private (string time, string location, bool locationParseSuccess) ParseTimeAndLocation(
    string timeAndLocation,
    int lineNumber,
    string lineContent,
    List<ParsingIssue> issues)
{
    var timeLocationMatch = _timeLocationRegex.Match(timeAndLocation);
    if (!timeLocationMatch.Success)
    {
        // No recognisable time: hand the whole text back as the time and let the caller's time parsing decide.
        return (timeAndLocation.Trim(), string.Empty, true);
    }

    var time = timeLocationMatch.Groups["Time"].Value.Trim();

    // Strip any time fragments that leaked into the location (e.g. "– 12:15 p.m. Exhibit Hall C").
    var locationGroup = timeLocationMatch.Groups["Location"];
    var location = locationGroup.Success ? CleanLocationText(locationGroup.Value.Trim()) : string.Empty;
    if (string.IsNullOrWhiteSpace(location))
    {
        // A missing location is valid; some events have none.
        return (time, string.Empty, true);
    }

    var patterns = _locationConfig?.LocationPatterns;
    if (patterns is not { Count: > 0 })
    {
        // Nothing to validate against: keep the location, but do not claim it matched a pattern.
        return (time, location, false);
    }

    var matchedLocation = MatchLocationPattern(location, patterns);
    if (!string.IsNullOrEmpty(matchedLocation))
    {
        return (time, matchedLocation, true);
    }

    // Keep the unmatched location so parsing can continue, and record it so a pattern can be added.
    issues.Add(new ParsingIssue
    {
        LineNumber = lineNumber,
        LineContent = lineContent,
        IssueType = ParsingIssueType.LocationParseFailure,
        Message = $"Location '{location}' does not match any configured pattern"
    });
    return (time, location, false);
}
// A single time token: "NOON" or a clock time with an a.m./p.m. marker.
private const string TimeTokenPattern = @"(?:NOON|\d{1,2}:?\d{0,2}\s*[AaPp]\.?[Mm]\.?)";

// A time at the start, followed by whitespace (e.g. "12:15 p.m. " or "NOON ").
private static readonly Regex _leadingTimeRe =
    new("^" + TimeTokenPattern + @"\s+", RegexOptions.IgnoreCase);

// A dash and a time at the start (e.g. "- 1:00 p.m. ").
private static readonly Regex _leadingDashTimeRe =
    new(@"^[–-]\s*" + TimeTokenPattern + @"\s*", RegexOptions.IgnoreCase);

// Text that consists of nothing but a time.
private static readonly Regex _timeOnlyRe =
    new("^" + TimeTokenPattern + "$", RegexOptions.IgnoreCase);

/// <summary>
/// Cleans location text by removing any time components left at its start.
/// Handles cases like "– 12:15 p.m. Exhibit Hall C" -> "Exhibit Hall C".
/// </summary>
/// <param name="locationText">Raw location text; may be null, empty or whitespace.</param>
/// <returns>The location without leading dashes and times, or an empty string when nothing but a time remains.</returns>
private string CleanLocationText(string locationText)
{
    if (string.IsNullOrWhiteSpace(locationText))
        return string.Empty;

    // Leading dashes, spaces and tabs go first, so a dash can only reappear at the start
    // after a leading time has been stripped; that is why the dash+time pattern runs second.
    var cleaned = locationText.TrimStart('–', '-', ' ', '\t').Trim();
    cleaned = _leadingTimeRe.Replace(cleaned, "").Trim();
    cleaned = _leadingDashTimeRe.Replace(cleaned, "").Trim();

    // If only a time is left, there is no location.
    return _timeOnlyRe.IsMatch(cleaned) ? string.Empty : cleaned;
}
/// <summary>
/// Matches location text against the configured patterns, case-insensitively.
/// A pattern without '*' must equal the whole location; in a pattern with '*', each '*' stands
/// for any run of characters (e.g. "Exhibit Hall *", "Room *").
/// </summary>
/// <param name="locationText">Location text; surrounding whitespace is ignored.</param>
/// <param name="patterns">Patterns to try in order; null, empty and whitespace-only entries are skipped.</param>
/// <returns>The trimmed location when some pattern matches it; otherwise an empty string.</returns>
private string MatchLocationPattern(string locationText, List<string> patterns)
{
    var candidate = locationText.Trim();
    if (candidate.Length == 0)
        return string.Empty;

    foreach (var rawPattern in patterns)
    {
        if (string.IsNullOrWhiteSpace(rawPattern))
            continue;

        var pattern = rawPattern.Trim();

        bool isMatch;
        if (pattern.Contains('*'))
        {
            // Escape everything, then turn the escaped wildcard back into "any characters".
            // The static Regex.IsMatch reuses cached regexes instead of building one per call.
            var wildcardRegex = "^" + Regex.Escape(pattern).Replace(@"\*", ".*?") + "$";
            isMatch = Regex.IsMatch(candidate, wildcardRegex, RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
        }
        else
        {
            // Literal patterns such as "Online", "Virtual" or "TBD".
            isMatch = string.Equals(pattern, candidate, StringComparison.OrdinalIgnoreCase);
        }

        if (isMatch)
            return candidate;
    }

    return string.Empty;
}
/// <summary>
/// Extracts the start time from a time or time-range string such as "10:30 a.m.",
/// "10:30 a.m. - NOON" or "TBD".
/// </summary>
/// <param name="time">Time text; for a range written with " - ", only the part before it is used.</param>
/// <returns>
/// The start time. "TBD" yields midnight as a placeholder so the occurrence can still be created;
/// "NOON" yields 12:00.
/// </returns>
/// <exception cref="FormatException">No clock time with an a.m./p.m. marker was found.</exception>
/// <exception cref="ArgumentOutOfRangeException">The hour or minute is outside the valid range.</exception>
private TimeOnly ParseStartTime(string time)
{
    var text = time.Trim();

    if (string.Equals(text, "TBD", StringComparison.OrdinalIgnoreCase))
        return new TimeOnly(0, 0, 0);

    // Keep only the start of a range.
    var rangeSeparator = text.IndexOf(" - ", StringComparison.Ordinal);
    if (rangeSeparator >= 0)
        text = text[..rangeSeparator].Trim();

    if (string.Equals(text, "NOON", StringComparison.OrdinalIgnoreCase))
        return new TimeOnly(12, 0, 0);

    // The time pattern is written in lowercase; lower-case culture-independently before matching.
    var timeMatch = _timeRe.Match(text.ToLowerInvariant());
    if (!timeMatch.Success)
        throw new FormatException($"Time format not recognized: {text}");

    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var hour = int.Parse(timeMatch.Groups["Hour"].Value, invariant);
    var minute = timeMatch.Groups["Minute"].Success
        ? int.Parse(timeMatch.Groups["Minute"].Value, invariant)
        : 0;

    // The marker may be written "pm", "p.m.", "p.m" or "pm."; its first letter decides.
    var isPm = timeMatch.Groups["APM"].Value.StartsWith('p');
    if (isPm && hour < 12)
        hour += 12;        // 1 p.m. .. 11 p.m. -> 13 .. 23
    else if (!isPm && hour == 12)
        hour = 0;          // 12:xx a.m. is just after midnight

    return new TimeOnly(hour, minute, 0);
}
}