Files
chapter-organizer/Core/Parsers/EventOccurrenceParser.cs
T
poprhythm cf2c0d8068 Add footnotes support to event occurrence parsing
This commit introduces a new property for capturing footnotes in both the EventOccurrenceParseResult and EventOccurrenceParserResult classes. The EventOccurrenceParser has been updated to handle footnotes, which are identified by lines starting with "*" or as parenthetical notes. The logic for processing these footnotes has been integrated into the parsing flow, ensuring that they are correctly associated with their respective event definitions. Additionally, the EventOccurrenceParserService has been modified to copy footnotes from the parser result, enhancing the overall event parsing functionality.
2026-01-09 10:30:53 -05:00

393 lines
14 KiB
C#

using System.Text.RegularExpressions;
using Core.Entities;
using Core.Models;
using EventOccurrenceParsers = Core.Parsers.EventOccurrence;
using Core.Utility;
using SchoolLevel = Core.Models.SchoolLevel;
namespace Core.Parsers;
/// <summary>
/// Outcome of parsing an event occurrence file: the occurrences that were read,
/// the issues raised while reading them, and bookkeeping about sections that were
/// filtered out by school level and footnotes that were captured.
/// </summary>
public class EventOccurrenceParserResult
{
    /// <summary>
    /// Parsed occurrences, grouped by the event definition they were resolved to.
    /// </summary>
    public IDictionary<EventDefinition, List<Entities.EventOccurrence>> Occurrences { get; set; }
        = new Dictionary<EventDefinition, List<Entities.EventOccurrence>>();

    /// <summary>
    /// Problems recorded while parsing (unmatched lines, unresolved event definitions,
    /// date/time parse failures), each tagged with its line number and content.
    /// </summary>
    public List<ParsingIssue> Issues { get; set; } = [];

    /// <summary>
    /// High-school section header lines that were skipped because the parser was
    /// configured for middle school.
    /// </summary>
    public List<string> SkippedHSSectionHeaders { get; set; } = [];

    /// <summary>
    /// Middle-school section header lines that were skipped because the parser was
    /// configured for high school.
    /// </summary>
    public List<string> SkippedMSSectionHeaders { get; set; } = [];

    /// <summary>
    /// Number of occurrence lines skipped because they sat under a middle-school section
    /// while the parser was configured for high school.
    /// </summary>
    public int SkippedMSEventCount { get; set; }

    /// <summary>
    /// Number of occurrence lines skipped because they sat under a high-school section
    /// while the parser was configured for middle school.
    /// </summary>
    public int SkippedHSEventCount { get; set; }

    /// <summary>
    /// Footnotes captured for each event definition. Footnotes are lines that start with "*"
    /// or are parenthetical notes. Multiple footnotes for the same event definition are
    /// joined into a single string, separated by one space.
    /// </summary>
    public IDictionary<EventDefinition, string> Footnotes { get; set; }
        = new Dictionary<EventDefinition, string>();
}
/// <summary>
/// Reads a plain-text event schedule line by line and turns it into event occurrences
/// grouped by event definition, collecting footnotes and parsing issues along the way.
/// Sections can optionally be filtered by school level.
/// </summary>
public class EventOccurrenceParser
{
    // Strips a trailing weekday such as "Monday," from an occurrence name.
    // Cached so the pattern is not re-resolved for every occurrence line.
    private static readonly Regex s_trailingWeekday = new(
        @"(?<Weekday>Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s?$",
        RegexOptions.Compiled);

    // Whole-word school level tokens, so that words like "Teams" or "Systems"
    // are not mistaken for the "MS" designation.
    private static readonly Regex s_middleSchoolToken = new(
        @"\bMS\b",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

    private static readonly Regex s_highSchoolToken = new(
        @"\bHS\b",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

    private readonly FileSystemInfo _txtFile;
    private readonly ICollection<EventDefinition> _events;
    private readonly SchoolLevel? _schoolLevel;

    /// <summary>
    /// Creates a parser for the given schedule file.
    /// </summary>
    /// <param name="txtFile">The text file to read; its <c>FullName</c> is opened by <see cref="Parse"/>.</param>
    /// <param name="events">Known event definitions that section headers are matched against.</param>
    /// <param name="schoolLevel">
    /// Optional school level filter. When null, no section is skipped; when set, sections
    /// explicitly marked with the other level are skipped.
    /// </param>
    public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events, SchoolLevel? schoolLevel = null)
    {
        _events = events;
        _txtFile = txtFile;
        _schoolLevel = schoolLevel;
    }

    /// <summary>
    /// Parses the file and returns the occurrences, footnotes, skip statistics and issues found.
    /// </summary>
    /// <remarks>
    /// Each non-empty, non-comment line is tried, in order, as: an occurrence line, an explicit
    /// section header, a general schedule header, a legacy "MS"/"HS" header, a footnote, and
    /// finally reported as an unmatched line.
    /// </remarks>
    /// <returns>The populated <see cref="EventOccurrenceParserResult"/>.</returns>
    public EventOccurrenceParserResult Parse()
    {
        var result = new EventOccurrenceParserResult();
        var occurrences = result.Occurrences;
        var issues = result.Issues;
        var footnotes = result.Footnotes;

        EventDefinition? currentEventDefinition = null;
        SchoolLevel? currentSectionLevel = null;
        bool inFootnoteMode = false;

        // The schedule carries no year; every date is interpreted in the current year.
        // Read once so all lines of one parse agree on it.
        int scheduleYear = DateTime.Now.Year;

        int lineNumber = 0; // 1-based line number used in reported issues
        foreach (var rawLine in File.ReadLines(_txtFile.FullName))
        {
            lineNumber++;

            // Normalize input: trim and normalize hyphens (en-dash, em-dash -> regular hyphen)
            // so the grammar parser can assume normalized input.
            var normalizedLine = TextUtil.SanitizeInput(rawLine.Trim());

            // Empty lines and comment lines ("#") are skipped, and both end footnote mode.
            if (EventOccurrenceParsers.LineClassifier.IsEmptyLine(normalizedLine)
                || EventOccurrenceParsers.LineClassifier.IsCommentLine(normalizedLine))
            {
                inFootnoteMode = false;
                continue;
            }

            var occurrenceLine = EventOccurrenceGrammar.TryParseOccurrenceLine(normalizedLine);
            if (!occurrenceLine.HasValue)
            {
                // ---- Explicit section header (grammar parser) ----
                var sectionHeader = EventOccurrenceGrammar.TryParseSectionHeader(normalizedLine);
                if (sectionHeader.HasValue)
                {
                    var (eventNamePart, schoolLevel) = sectionHeader.Value;
                    inFootnoteMode = false;

                    // The section level is tracked even for skipped sections so that the
                    // occurrences below them can be counted and ignored.
                    currentSectionLevel = SetCurrentSectionLevel(schoolLevel);
                    if (ShouldSkipSection(currentSectionLevel, normalizedLine, result))
                    {
                        currentEventDefinition = null; // Skip subsequent occurrences; no issue created
                        continue;
                    }

                    // Use fuzzy matching to find the best matching event definition
                    var evt = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(eventNamePart, _events);
                    if (evt == null)
                    {
                        var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(eventNamePart, _events);
                        issues.Add(CreateIssue(
                            lineNumber,
                            normalizedLine,
                            ParsingIssueType.UnmatchedLine,
                            $"Section header '{eventNamePart} - {schoolLevel}' found but no matching event definition (best match ratio: {bestRatio})"));
                    }

                    // A new section has started either way. When the header is unmatched the
                    // current event definition is cleared, so the occurrences below it are not
                    // silently attributed to the previous section's event.
                    currentEventDefinition = evt;
                    continue;
                }

                // ---- General Schedule/Session ----
                if (EventOccurrenceParsers.SectionHeaderMatcher.IsGeneralSchedule(normalizedLine))
                {
                    inFootnoteMode = false;
                    currentSectionLevel = null; // Reset section level
                    currentEventDefinition = EventDefinition.GeneralSchedule;
                    continue;
                }

                // ---- Legacy header: a line that merely mentions "MS" or "HS" (backward compatibility) ----
                if (EventOccurrenceParsers.SectionHeaderMatcher.HasSchoolLevel(normalizedLine))
                {
                    inFootnoteMode = false;

                    currentSectionLevel = DetectSchoolLevel(normalizedLine);
                    if (ShouldSkipSection(currentSectionLevel, normalizedLine, result))
                    {
                        currentEventDefinition = null; // Skip subsequent occurrences; no issue created
                        continue;
                    }

                    var evt = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(normalizedLine, _events);
                    if (evt == null)
                    {
                        var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(normalizedLine, _events);
                        issues.Add(CreateIssue(
                            lineNumber,
                            normalizedLine,
                            ParsingIssueType.UnmatchedLine,
                            $"Section header with 'MS' or 'HS' found but no matching event definition (best match ratio: {bestRatio})"));

                        // NOTE(review): unlike the explicit header above, the current event
                        // definition is deliberately left untouched here. This branch is a loose
                        // heuristic, and clearing the event on a false positive would orphan the
                        // rest of the section. Confirm against HasSchoolLevel's actual strictness.
                        continue;
                    }

                    currentEventDefinition = evt;
                    continue;
                }

                // ---- Footnotes ----
                // A line starting with "*" opens footnote mode; following lines stay in it
                // until an empty line, comment, header or occurrence ends it.
                if (normalizedLine.TrimStart().StartsWith("*", StringComparison.Ordinal))
                {
                    inFootnoteMode = true;
                }

                if (inFootnoteMode || EventOccurrenceParsers.LineClassifier.IsContinuationLine(normalizedLine))
                {
                    // Footnotes outside any event section (or inside a skipped one) are dropped.
                    if (currentEventDefinition != null)
                    {
                        AppendFootnote(footnotes, currentEventDefinition, normalizedLine);
                    }
                    continue;
                }

                // "Voting Delegates" section header is no longer used - occurrences are categorized by name pattern.
                // Anything left over is reported as an unmatched line.
                if (!string.IsNullOrWhiteSpace(normalizedLine))
                {
                    issues.Add(CreateIssue(
                        lineNumber,
                        normalizedLine,
                        ParsingIssueType.UnmatchedLine,
                        "Line does not match expected format (Name Month Day Time/Location)"));
                }
                continue;
            }

            // ---- Occurrence line ----
            inFootnoteMode = false;

            // Skip occurrences under sections that don't match the school level setting
            if (ShouldSkipOccurrence(currentSectionLevel, result))
            {
                continue;
            }

            var (occurrenceName, month, dayOfMonthStr, timeAndLocation) = occurrenceLine.Value;

            // Remove weekday suffix from occurrence name if present
            occurrenceName = s_trailingWeekday.Replace(occurrenceName, "").Trim();

            // Determine event definition based on occurrence name pattern or current section
            EventDefinition? eventDefinition = EventOccurrenceParsers.EventDefinitionResolver.Resolve(occurrenceName, currentEventDefinition);
            if (eventDefinition == null)
            {
                issues.Add(CreateIssue(
                    lineNumber,
                    normalizedLine,
                    ParsingIssueType.MissingEventDefinition,
                    $"Cannot determine event definition for occurrence: {occurrenceName}"));
                continue;
            }

            // timeAndLocation is already normalized (hyphens normalized) since normalizedLine was sanitized.
            // Split it into the time text and the location text.
            EventOccurrenceParsers.TimeLocationParser.Parse(timeAndLocation, out string time, out string location);

            // Parse date; a failure is reported as an issue and the line is dropped.
            DateOnly? startDate = null;
            try
            {
                startDate = TextUtil.ParseDate(month, dayOfMonthStr.ToString(), scheduleYear);
            }
            catch (Exception ex)
            {
                issues.Add(CreateIssue(
                    lineNumber,
                    normalizedLine,
                    ParsingIssueType.DateParseFailure,
                    $"Failed to parse date: {ex.Message}"));
                continue;
            }

            // Parse time; a failure is reported as an issue and the line is dropped.
            TimeOnly? startTime = null;
            try
            {
                startTime = EventOccurrenceParsers.TimeParser.Parse(time);
            }
            catch (Exception ex)
            {
                issues.Add(CreateIssue(
                    lineNumber,
                    normalizedLine,
                    ParsingIssueType.TimeParseFailure,
                    $"Failed to parse time '{time}': {ex.Message}"));
                continue;
            }

            // NOTE(review): a parser that returns null without throwing drops the occurrence
            // silently (no issue is recorded). Kept as-is; confirm this is intended.
            if (startDate == null || startTime == null)
                continue;

            var eventOccurrence = new Core.Entities.EventOccurrence
            {
                Name = occurrenceName,
                StartTime = new DateTime(startDate.Value, startTime.Value),
                Time = $"{time}",
                Date = $"{month} {dayOfMonthStr}",
                Location = location
            };

            if (!occurrences.TryGetValue(eventDefinition, out var bucket))
            {
                bucket = [];
                occurrences.Add(eventDefinition, bucket);
            }
            bucket.Add(eventOccurrence);

            // Reset section level when we successfully parse an occurrence (means we're in a valid section)
            currentSectionLevel = null;
        }

        return result;
    }

    /// <summary>
    /// Builds a <see cref="ParsingIssue"/> for the given line.
    /// </summary>
    private static ParsingIssue CreateIssue(int lineNumber, string lineContent, ParsingIssueType issueType, string message)
    {
        return new ParsingIssue
        {
            LineNumber = lineNumber,
            LineContent = lineContent,
            IssueType = issueType,
            Message = message
        };
    }

    /// <summary>
    /// Appends a footnote line to the text stored for an event definition, separating it
    /// from any existing text with a single space.
    /// </summary>
    private static void AppendFootnote(IDictionary<EventDefinition, string> footnotes, EventDefinition eventDefinition, string footnoteLine)
    {
        footnotes[eventDefinition] =
            footnotes.TryGetValue(eventDefinition, out var existing) && !string.IsNullOrEmpty(existing)
                ? existing + " " + footnoteLine
                : footnoteLine;
    }

    /// <summary>
    /// Extracts the school level from a legacy header line that mentions "MS" or "HS".
    /// Whole-word tokens win; the plain substring test is only a fallback for lines where
    /// the designation is glued to other characters.
    /// </summary>
    /// <param name="line">The normalized line.</param>
    /// <returns>The detected level, or null if the line mentions neither.</returns>
    private static SchoolLevel? DetectSchoolLevel(string line)
    {
        if (s_middleSchoolToken.IsMatch(line))
            return SchoolLevel.MiddleSchool;
        if (s_highSchoolToken.IsMatch(line))
            return SchoolLevel.HighSchool;

        // Fallback: original substring behaviour (MS is checked first).
        if (line.Contains("MS", StringComparison.OrdinalIgnoreCase))
            return SchoolLevel.MiddleSchool;
        if (line.Contains("HS", StringComparison.OrdinalIgnoreCase))
            return SchoolLevel.HighSchool;

        return null;
    }

    /// <summary>
    /// Determines if a section should be skipped based on the configured school level.
    /// If no school level is configured, or the section carries no level, nothing is skipped.
    /// A skipped header is recorded in the matching "skipped section headers" list of the result.
    /// </summary>
    /// <param name="sectionSchoolLevel">The school level of the section (null if no school level designation).</param>
    /// <param name="normalizedLine">The normalized line content for tracking skipped headers.</param>
    /// <param name="result">The parse result to update with skipped headers.</param>
    /// <returns>True if the section should be skipped, false otherwise.</returns>
    private bool ShouldSkipSection(SchoolLevel? sectionSchoolLevel, string normalizedLine, EventOccurrenceParserResult result)
    {
        // Sections without a level are never skipped, and without a configured level there is no filtering.
        if (!sectionSchoolLevel.HasValue || !_schoolLevel.HasValue)
            return false;

        if (_schoolLevel.Value == SchoolLevel.MiddleSchool && sectionSchoolLevel.Value == SchoolLevel.HighSchool)
        {
            result.SkippedHSSectionHeaders.Add(normalizedLine);
            return true;
        }

        if (_schoolLevel.Value == SchoolLevel.HighSchool && sectionSchoolLevel.Value == SchoolLevel.MiddleSchool)
        {
            result.SkippedMSSectionHeaders.Add(normalizedLine);
            return true;
        }

        return false;
    }

    /// <summary>
    /// Converts string school level ("MS", "HS", or empty) to SchoolLevel?.
    /// The comparison ignores case; any other text yields null.
    /// </summary>
    /// <param name="schoolLevelStr">The school level string from the section header.</param>
    /// <returns>SchoolLevel? representing the school level, or null if no school level designation.</returns>
    private static SchoolLevel? SetCurrentSectionLevel(string schoolLevelStr)
    {
        if (string.IsNullOrWhiteSpace(schoolLevelStr))
            return null;

        if (schoolLevelStr.Equals("MS", StringComparison.OrdinalIgnoreCase))
            return SchoolLevel.MiddleSchool;

        if (schoolLevelStr.Equals("HS", StringComparison.OrdinalIgnoreCase))
            return SchoolLevel.HighSchool;

        return null;
    }

    /// <summary>
    /// Checks if the current occurrence should be skipped based on the level of the section it is in.
    /// If no school level is configured, or the section carries no level, nothing is skipped.
    /// A skipped occurrence increments the matching skip counter of the result.
    /// </summary>
    /// <param name="currentSectionLevel">The current section level.</param>
    /// <param name="result">The parse result to update with skip counts.</param>
    /// <returns>True if the occurrence should be skipped, false otherwise.</returns>
    private bool ShouldSkipOccurrence(SchoolLevel? currentSectionLevel, EventOccurrenceParserResult result)
    {
        // Occurrences outside a levelled section are never skipped, and without a configured level there is no filtering.
        if (!currentSectionLevel.HasValue || !_schoolLevel.HasValue)
            return false;

        if (_schoolLevel.Value == SchoolLevel.MiddleSchool && currentSectionLevel.Value == SchoolLevel.HighSchool)
        {
            result.SkippedHSEventCount++;
            return true;
        }

        if (_schoolLevel.Value == SchoolLevel.HighSchool && currentSectionLevel.Value == SchoolLevel.MiddleSchool)
        {
            result.SkippedMSEventCount++;
            return true;
        }

        return false;
    }
}