2eae3f205c
This commit updates the EventOccurrenceParser to ensure that high school (HS) events are not incorrectly associated with middle school (MS) events during parsing. The logic now gracefully skips HS section headers, preventing any fuzzy matching from leading to incorrect associations. Additionally, a new unit test has been added to verify that HS occurrences are correctly excluded from MS event occurrences, ensuring the integrity of the parsing process.
276 lines
9.7 KiB
C#
276 lines
9.7 KiB
C#
using System.Text.RegularExpressions;
|
||
using Core.Entities;
|
||
using Core.Models;
|
||
using EventOccurrenceParsers = Core.Parsers.EventOccurrence;
|
||
using Core.Utility;
|
||
|
||
namespace Core.Parsers;
|
||
|
||
/// <summary>
/// Outcome of parsing an event occurrence file: the occurrences grouped by event definition,
/// the issues recorded while parsing, and the high school section headers that were skipped.
/// </summary>
public class EventOccurrenceParserResult
{
    /// <summary>Parsed occurrences, keyed by the event definition they were assigned to.</summary>
    public IDictionary<EventDefinition, List<Entities.EventOccurrence>> Occurrences { get; set; }
        = new Dictionary<EventDefinition, List<Entities.EventOccurrence>>();

    /// <summary>Issues recorded for lines that could not be matched or parsed.</summary>
    public List<ParsingIssue> Issues { get; set; }
        = new List<ParsingIssue>();

    /// <summary>Normalized text of each high school (HS) section header that was skipped.</summary>
    public List<string> SkippedHSSectionHeaders { get; set; }
        = new List<string>();
}
|
||
|
||
/// <summary>
/// Reads an event occurrence text file line by line and groups the occurrences it finds
/// under the event definitions supplied at construction.
/// </summary>
public class EventOccurrenceParser
{
    // Trailing weekday name followed by a comma at the end of an occurrence name.
    private static readonly Regex WeekdaySuffixPattern = new(
        @"(?<Weekday>Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s?$",
        RegexOptions.Compiled);

    // "HS" as a standalone token (not embedded in a longer word). A plain substring test
    // would treat words that merely contain the letters "hs" (e.g. "Maths") as high school.
    private static readonly Regex HighSchoolTokenPattern = new(
        @"(?<![A-Za-z])HS(?![A-Za-z])",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly FileSystemInfo _txtFile;
    private readonly ICollection<EventDefinition> _events;

    /// <summary>
    /// Creates a parser for the given text file.
    /// </summary>
    /// <param name="txtFile">File whose lines are read by <see cref="Parse"/>.</param>
    /// <param name="events">Event definitions that section headers are matched against.</param>
    public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events)
    {
        _events = events;
        _txtFile = txtFile;
    }

    /// <summary>
    /// Parses the file and returns the occurrences grouped by event definition together
    /// with the issues recorded along the way.
    /// </summary>
    /// <remarks>
    /// Each line is trimmed and sanitized, then handled as the first of: empty/comment line,
    /// occurrence line, section header, general schedule marker, loose "MS"/"HS" header,
    /// continuation line, or unmatched line. High school section headers are recorded in
    /// <see cref="EventOccurrenceParserResult.SkippedHSSectionHeaders"/> and the occurrences
    /// that follow them are skipped without creating issues.
    /// </remarks>
    /// <returns>The populated <see cref="EventOccurrenceParserResult"/>.</returns>
    public EventOccurrenceParserResult Parse()
    {
        var result = new EventOccurrenceParserResult();
        var occurrences = result.Occurrences;
        var issues = result.Issues;

        EventDefinition? currentEventDefinition = null;
        bool inContinuationMode = false;
        bool inHSSection = false;

        // Read the year once so every occurrence in this run is dated against the same year.
        var currentYear = DateTime.Now.Year;

        // Records a skipped HS header and makes the following occurrences be ignored.
        void EnterHighSchoolSection(string headerLine)
        {
            result.SkippedHSSectionHeaders.Add(headerLine);
            currentEventDefinition = null;
            inHSSection = true;
        }

        var lineNumber = 0; // 1-based line number reported in issues
        foreach (var line in File.ReadLines(_txtFile.FullName))
        {
            lineNumber++;

            // Trim and sanitize up front so the grammar helpers see normalized input.
            var normalizedLine = TextUtil.SanitizeInput(line.Trim());

            // Empty lines and comment lines are skipped; both end continuation mode.
            if (EventOccurrenceParsers.LineClassifier.IsEmptyLine(normalizedLine)
                || EventOccurrenceParsers.LineClassifier.IsCommentLine(normalizedLine))
            {
                inContinuationMode = false;
                continue;
            }

            var occurrenceLine = EventOccurrenceGrammar.TryParseOccurrenceLine(normalizedLine);
            if (!occurrenceLine.HasValue)
            {
                // Not an occurrence: try the other line types in order.
                var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(normalizedLine);
                if (sectionHeader.HasValue)
                {
                    var (eventNamePart, schoolLevel) = sectionHeader.Value;
                    inContinuationMode = false;

                    // HS headers are skipped before any fuzzy matching so they can never be
                    // associated with a similarly named MS event.
                    if (schoolLevel.Equals("HS", StringComparison.OrdinalIgnoreCase))
                    {
                        EnterHighSchoolSection(normalizedLine);
                        continue;
                    }

                    // A parsed header always starts a new section. When it matches nothing the
                    // current definition becomes null, so the occurrences below it are neither
                    // attributed to the previous section nor swallowed by a preceding HS section.
                    currentEventDefinition = MatchHeaderOrReport(
                        eventNamePart,
                        $"Section header '{eventNamePart} - {schoolLevel}'",
                        lineNumber,
                        normalizedLine,
                        issues);
                    inHSSection = false;
                    continue;
                }

                if (EventOccurrenceParsers.SectionHeaderMatcher.IsGeneralSchedule(normalizedLine))
                {
                    inContinuationMode = false;
                    inHSSection = false;
                    currentEventDefinition = EventDefinition.GeneralSchedule;
                    continue;
                }

                // Backward compatibility: lines that merely mention a school level.
                if (EventOccurrenceParsers.SectionHeaderMatcher.HasSchoolLevel(normalizedLine))
                {
                    inContinuationMode = false;

                    if (HighSchoolTokenPattern.IsMatch(normalizedLine))
                    {
                        EnterHighSchoolSection(normalizedLine);
                        continue;
                    }

                    var matched = MatchHeaderOrReport(
                        normalizedLine,
                        "Section header with 'MS' or 'HS'",
                        lineNumber,
                        normalizedLine,
                        issues);

                    // NOTE(review): an unmatched line leaves the current section untouched here,
                    // since this loose check may fire on lines that are not real headers - confirm.
                    if (matched is not null)
                    {
                        currentEventDefinition = matched;
                        inHSSection = false;
                    }
                    continue;
                }

                // A line starting with "*" switches continuation mode on.
                if (normalizedLine.TrimStart().StartsWith("*", StringComparison.Ordinal))
                {
                    inContinuationMode = true;
                }

                if (inContinuationMode || EventOccurrenceParsers.LineClassifier.IsContinuationLine(normalizedLine))
                {
                    continue;
                }

                // Anything left over is reported as an unmatched line.
                if (!string.IsNullOrWhiteSpace(normalizedLine))
                {
                    issues.Add(CreateIssue(
                        lineNumber,
                        normalizedLine,
                        ParsingIssueType.UnmatchedLine,
                        "Line does not match expected format (Name Month Day Time/Location)"));
                }
                continue;
            }

            // Occurrence lines end continuation mode.
            inContinuationMode = false;

            // Occurrences listed under an HS section are dropped without creating an issue.
            if (inHSSection)
            {
                continue;
            }

            var (occurrenceName, month, dayOfMonthStr, timeAndLocation) = occurrenceLine.Value;
            occurrenceName = WeekdaySuffixPattern.Replace(occurrenceName, "").Trim();

            EventDefinition? eventDefinition =
                EventOccurrenceParsers.EventDefinitionResolver.Resolve(occurrenceName, currentEventDefinition);
            if (eventDefinition is null)
            {
                issues.Add(CreateIssue(
                    lineNumber,
                    normalizedLine,
                    ParsingIssueType.MissingEventDefinition,
                    $"Cannot determine event definition for occurrence: {occurrenceName}"));
                continue;
            }

            // timeAndLocation comes from the already sanitized line, so no further normalization.
            EventOccurrenceParsers.TimeLocationParser.Parse(timeAndLocation, out string time, out string location);

            DateOnly? startDate = null;
            try
            {
                startDate = TextUtil.ParseDate(month, dayOfMonthStr.ToString(), currentYear);
            }
            catch (Exception ex)
            {
                issues.Add(CreateIssue(
                    lineNumber,
                    normalizedLine,
                    ParsingIssueType.DateParseFailure,
                    $"Failed to parse date: {ex.Message}"));
                continue;
            }

            TimeOnly? startTime = null;
            try
            {
                startTime = EventOccurrenceParsers.TimeParser.Parse(time);
            }
            catch (Exception ex)
            {
                issues.Add(CreateIssue(
                    lineNumber,
                    normalizedLine,
                    ParsingIssueType.TimeParseFailure,
                    $"Failed to parse time '{time}': {ex.Message}"));
                continue;
            }

            if (startDate == null || startTime == null)
            {
                continue;
            }

            var eventOccurrence = new Core.Entities.EventOccurrence
            {
                Name = occurrenceName,
                StartTime = new DateTime(startDate.Value, startTime.Value),
                Time = $"{time}",
                Date = $"{month} {dayOfMonthStr}",
                Location = location
            };

            // Single lookup; create the list the first time a definition is seen.
            if (!occurrences.TryGetValue(eventDefinition, out var occurrencesForEvent))
            {
                occurrencesForEvent = [];
                occurrences.Add(eventDefinition, occurrencesForEvent);
            }
            occurrencesForEvent.Add(eventOccurrence);
        }

        return result;
    }

    // Matches header text against the known event definitions; when nothing matches, records an
    // UnmatchedLine issue that includes the best match ratio and returns null.
    private EventDefinition? MatchHeaderOrReport(
        string matchText,
        string headerDescription,
        int lineNumber,
        string lineContent,
        List<ParsingIssue> issues)
    {
        var matched = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(matchText, _events);
        if (matched == null)
        {
            var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(matchText, _events);
            issues.Add(CreateIssue(
                lineNumber,
                lineContent,
                ParsingIssueType.UnmatchedLine,
                $"{headerDescription} found but no matching event definition (best match ratio: {bestRatio})"));
        }

        return matched;
    }

    // Builds a ParsingIssue for the given line.
    private static ParsingIssue CreateIssue(int lineNumber, string lineContent, ParsingIssueType issueType, string message) =>
        new ParsingIssue
        {
            LineNumber = lineNumber,
            LineContent = lineContent,
            IssueType = issueType,
            Message = message
        };
}