Files
chapter-organizer/Core/Parsers/EventOccurrenceParser.cs
T
poprhythm 19e5ef0675 Enhance event occurrence parsing to skip unmatched high school section headers
This commit introduces a new property to track skipped high school section headers in the EventOccurrenceParseResult and EventOccurrenceParserResult classes. The EventOccurrenceParser has been updated to gracefully skip HS section headers that do not match any event definitions, improving the parsing logic. Additionally, the LocationParsingConfiguration has been removed from the EventOccurrenceParser, simplifying its constructor. Unit tests have been updated to reflect these changes and ensure correct behavior during parsing.
2026-01-09 00:14:19 -05:00

272 lines
9.3 KiB
C#

using System.Text.RegularExpressions;
using Core.Entities;
using Core.Models;
using EventOccurrenceParsers = Core.Parsers.EventOccurrence;
using Core.Utility;
namespace Core.Parsers;
/// <summary>
/// Outcome of parsing an event occurrence file: the occurrences that were read, the issues
/// raised along the way, and the HS section headers that were skipped.
/// </summary>
public class EventOccurrenceParserResult
{
    /// <summary>Parsed occurrences, grouped under the event definition each one was assigned to.</summary>
    public IDictionary<EventDefinition, List<Entities.EventOccurrence>> Occurrences { get; set; }
        = new Dictionary<EventDefinition, List<Entities.EventOccurrence>>();

    /// <summary>Problems recorded for individual lines while parsing.</summary>
    public List<ParsingIssue> Issues { get; set; } = [];

    /// <summary>Normalized text of HS section headers that matched no event definition and were skipped.</summary>
    public List<string> SkippedHSSectionHeaders { get; set; } = [];
}
/// <summary>
/// Reads a text file of event occurrences line by line and groups the parsed occurrences by
/// <see cref="EventDefinition"/>. Lines that cannot be handled are reported as
/// <see cref="ParsingIssue"/> entries instead of aborting the parse.
/// </summary>
public class EventOccurrenceParser
{
    // Trailing weekday followed by a comma (e.g. "Opening Session Friday,") at the end of an occurrence name.
    private static readonly Regex s_trailingWeekdayPattern = new(
        @"(?<Weekday>Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s?$",
        RegexOptions.Compiled);

    // "HS" as a standalone token: no letter directly before or after it. A plain substring test
    // would also match the "hs" inside ordinary words (e.g. "Workshops") and silently skip that section.
    private static readonly Regex s_highSchoolTokenPattern = new(
        @"(?<![A-Za-z])HS(?![A-Za-z])",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    private readonly FileSystemInfo _txtFile;
    private readonly ICollection<EventDefinition> _events;

    /// <summary>Creates a parser for the given occurrence file.</summary>
    /// <param name="txtFile">The text file to read; its <c>FullName</c> is opened by <see cref="Parse"/>.</param>
    /// <param name="events">Event definitions that section headers are matched against.</param>
    public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events)
    {
        _events = events;
        _txtFile = txtFile;
    }

    /// <summary>
    /// Parses the file and returns the occurrences, parsing issues and skipped HS section headers.
    /// </summary>
    /// <remarks>
    /// Per-line failures (unmatched lines, date/time parse errors, unresolved event definitions) are
    /// collected in the result. Exceptions raised while opening or reading the file are not caught here.
    /// </remarks>
    /// <returns>A result object holding everything gathered during the parse.</returns>
    public EventOccurrenceParserResult Parse()
    {
        var result = new EventOccurrenceParserResult();
        var occurrences = result.Occurrences;
        var issues = result.Issues;

        EventDefinition? currentEventDefinition = null;
        bool inContinuationMode = false;
        bool inHSSection = false;

        // Occurrence lines carry no year; read the clock once so every line in this run uses the same year.
        int currentYear = DateTime.Now.Year;

        int index = 0; // 1-based line number used in reported issues
        foreach (var line in File.ReadLines(_txtFile.FullName))
        {
            index++;

            // Normalize input (trim + sanitize) so the grammar parser can assume normalized text.
            var normalizedLine = TextUtil.SanitizeInput(line.Trim());

            // Empty lines are skipped and end continuation mode.
            if (EventOccurrenceParsers.LineClassifier.IsEmptyLine(normalizedLine))
            {
                inContinuationMode = false;
                continue;
            }

            // Comment lines are skipped and end continuation mode.
            if (EventOccurrenceParsers.LineClassifier.IsCommentLine(normalizedLine))
            {
                inContinuationMode = false;
                continue;
            }

            var occurrenceLine = EventOccurrenceGrammar.TryParseOccurrenceLine(normalizedLine);
            if (!occurrenceLine.HasValue)
            {
                // Not an occurrence line: try the other line types in order.

                // 1) Explicit section header recognised by the grammar parser.
                var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(normalizedLine);
                if (sectionHeader.HasValue)
                {
                    var (eventNamePart, schoolLevel) = sectionHeader.Value;
                    inContinuationMode = false;

                    var evt = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(eventNamePart, _events);
                    if (evt == null)
                    {
                        // Unmatched HS headers are skipped without raising an issue.
                        if (schoolLevel.Equals("HS", StringComparison.OrdinalIgnoreCase))
                        {
                            result.SkippedHSSectionHeaders.Add(normalizedLine);
                            currentEventDefinition = null;
                            inHSSection = true; // occurrences below this header are skipped too
                            continue;
                        }

                        // Any other unmatched header is reported.
                        var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(eventNamePart, _events);
                        issues.Add(new ParsingIssue
                        {
                            LineNumber = index,
                            LineContent = normalizedLine,
                            IssueType = ParsingIssueType.UnmatchedLine,
                            Message = $"Section header '{eventNamePart} - {schoolLevel}' found but no matching event definition (best match ratio: {bestRatio})"
                        });
                        continue;
                    }

                    currentEventDefinition = evt;
                    inHSSection = false;
                    continue;
                }

                // 2) General Schedule/Session header.
                if (EventOccurrenceParsers.SectionHeaderMatcher.IsGeneralSchedule(normalizedLine))
                {
                    inContinuationMode = false;
                    inHSSection = false;
                    currentEventDefinition = EventDefinition.GeneralSchedule;
                    continue;
                }

                // 3) Backward compatibility: a line that merely mentions a school level.
                if (EventOccurrenceParsers.SectionHeaderMatcher.HasSchoolLevel(normalizedLine))
                {
                    inContinuationMode = false;

                    var evt = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(normalizedLine, _events);
                    if (evt == null)
                    {
                        // Only a standalone "HS" token marks a high school header; see s_highSchoolTokenPattern.
                        if (s_highSchoolTokenPattern.IsMatch(normalizedLine))
                        {
                            result.SkippedHSSectionHeaders.Add(normalizedLine);
                            currentEventDefinition = null;
                            inHSSection = true; // occurrences below this header are skipped too
                            continue;
                        }

                        var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(normalizedLine, _events);
                        issues.Add(new ParsingIssue
                        {
                            LineNumber = index,
                            LineContent = normalizedLine,
                            IssueType = ParsingIssueType.UnmatchedLine,
                            Message = $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {bestRatio})"
                        });
                        continue;
                    }

                    currentEventDefinition = evt;
                    inHSSection = false;
                    continue;
                }

                // 4) A line starting with "*" switches continuation mode on; it stays on until an
                //    empty line, comment, header or occurrence line switches it off again.
                if (normalizedLine.TrimStart().StartsWith("*", StringComparison.Ordinal))
                {
                    inContinuationMode = true;
                }

                if (inContinuationMode || EventOccurrenceParsers.LineClassifier.IsContinuationLine(normalizedLine))
                {
                    continue;
                }

                // 5) Nothing recognised the line: report it.
                if (!string.IsNullOrWhiteSpace(normalizedLine))
                {
                    issues.Add(new ParsingIssue
                    {
                        LineNumber = index,
                        LineContent = normalizedLine,
                        IssueType = ParsingIssueType.UnmatchedLine,
                        Message = "Line does not match expected format (Name Month Day Time/Location)"
                    });
                }
                continue;
            }

            // Occurrence lines end continuation mode.
            inContinuationMode = false;

            // Occurrences under a skipped HS section are dropped. Past this guard the flag is
            // always false, so it does not need to be reset after a successful parse.
            if (inHSSection)
            {
                continue;
            }

            var (occurrenceName, month, dayOfMonthStr, timeAndLocation) = occurrenceLine.Value;

            // Remove a trailing weekday ("..., Friday,") from the occurrence name if present.
            occurrenceName = s_trailingWeekdayPattern.Replace(occurrenceName, "").Trim();

            // Resolve the event definition from the occurrence name and/or the current section.
            EventDefinition? eventDefinition =
                EventOccurrenceParsers.EventDefinitionResolver.Resolve(occurrenceName, currentEventDefinition);
            if (eventDefinition == null)
            {
                issues.Add(new ParsingIssue
                {
                    LineNumber = index,
                    LineContent = normalizedLine,
                    IssueType = ParsingIssueType.MissingEventDefinition,
                    Message = $"Cannot determine event definition for occurrence: {occurrenceName}"
                });
                continue;
            }

            // Split the remainder of the line into its time and location parts.
            EventOccurrenceParsers.TimeLocationParser.Parse(timeAndLocation, out string time, out string location);

            DateOnly? startDate = null;
            try
            {
                startDate = TextUtil.ParseDate(month, dayOfMonthStr.ToString(), currentYear);
            }
            catch (Exception ex)
            {
                issues.Add(new ParsingIssue
                {
                    LineNumber = index,
                    LineContent = normalizedLine,
                    IssueType = ParsingIssueType.DateParseFailure,
                    Message = $"Failed to parse date: {ex.Message}"
                });
                continue;
            }

            TimeOnly? startTime = null;
            try
            {
                startTime = EventOccurrenceParsers.TimeParser.Parse(time);
            }
            catch (Exception ex)
            {
                issues.Add(new ParsingIssue
                {
                    LineNumber = index,
                    LineContent = normalizedLine,
                    IssueType = ParsingIssueType.TimeParseFailure,
                    Message = $"Failed to parse time '{time}': {ex.Message}"
                });
                continue;
            }

            // A parser may return null without throwing; such lines are dropped without an issue.
            if (startDate == null || startTime == null)
            {
                continue;
            }

            var eventOccurrence = new Core.Entities.EventOccurrence
            {
                Name = occurrenceName,
                StartTime = new DateTime(startDate.Value, startTime.Value),
                Time = $"{time}",
                Date = $"{month} {dayOfMonthStr}",
                Location = location
            };

            // Single lookup: fetch the bucket for this definition, creating it on first use.
            if (!occurrences.TryGetValue(eventDefinition, out var bucket))
            {
                bucket = [];
                occurrences.Add(eventDefinition, bucket);
            }
            bucket.Add(eventOccurrence);
        }

        return result;
    }
}