Enhance event occurrence parsing to skip unmatched high school section headers

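This commit adds a SkippedHSSectionHeaders property to the EventOccurrenceParseResult and EventOccurrenceParserResult classes to record high school (HS) section headers that were skipped. EventOccurrenceParser now gracefully skips HS section headers that do not match any event definition, together with the occurrence lines beneath them, without recording a parsing issue; unmatched non-HS headers are still reported as before. Continuation handling has also changed: a line starting with "*" now begins a continuation block whose lines are skipped until a blank line, comment line, or section/general-schedule header is reached, and LineClassifier.IsContinuationLine no longer treats lowercase continuation words or "Schedule Posted"/"Note:" text as continuation lines. Additionally, LocationParsingConfiguration has been removed from EventOccurrenceParser (simplifying its constructor) and from TimeLocationParser.Parse, which now uses the text after the time as the location, and the LocationParseFailure issue type has been dropped. Unit tests have been updated to reflect these changes and ensure correct behavior during parsing.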
This commit is contained in:
2026-01-09 00:14:19 -05:00
parent f916cfad6b
commit 19e5ef0675
10 changed files with 279 additions and 372 deletions
+7 -6
View File
@@ -30,6 +30,12 @@ public class EventOccurrenceParseResult
/// </summary>
public List<ParsingIssue> Issues { get; set; } = new();
/// <summary>
/// List of high school (HS) section headers that were encountered but skipped
/// because they don't match any event definition in the system.
/// </summary>
public List<string> SkippedHSSectionHeaders { get; set; } = new();
/// <summary>
/// Total number of event occurrences successfully parsed.
/// </summary>
@@ -110,11 +116,6 @@ public enum ParsingIssueType
/// <summary>
/// Invalid format or other parsing issue.
/// </summary>
InvalidFormat,
/// <summary>
/// Location parsing failed (no matching pattern found).
/// </summary>
LocationParseFailure
InvalidFormat
}
+5 -19
View File
@@ -24,33 +24,19 @@ public static class LineClassifier
/// <summary>
/// Determines if a line is a continuation/wrapped line that should be skipped.
/// These are typically lines that:
/// - Start with lowercase or special characters (not event names)
/// - Start with "*" (marks the start of a continuation block)
/// - Are parenthetical notes like "(Semifinalists only)"
/// - Are informational text like "Schedule Posted on..."
/// </summary>
public static bool IsContinuationLine(string line)
{
var trimmed = line.Trim();
// Skip parenthetical notes
if (trimmed.StartsWith("(", StringComparison.Ordinal) && trimmed.EndsWith(")", StringComparison.Ordinal))
// Check if line starts with "*" (marks continuation block start)
if (trimmed.StartsWith("*", StringComparison.Ordinal))
return true;
// Skip lines that are clearly continuation text (start with lowercase, common continuation words)
if (trimmed.Length > 0 && char.IsLower(trimmed[0]))
{
// Check if it starts with common continuation words
var continuationPrefixes = new[] { "be ", "the ", "and ", "or ", "to ", "a ", "an ", "will ", "may ", "can " };
foreach (var prefix in continuationPrefixes)
{
if (trimmed.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
return true;
}
}
// Skip informational lines that don't contain dates/times
if (trimmed.Contains("Schedule Posted", StringComparison.OrdinalIgnoreCase) ||
trimmed.Contains("Note:", StringComparison.OrdinalIgnoreCase))
// Skip parenthetical notes
if (trimmed.StartsWith("(", StringComparison.Ordinal) && trimmed.EndsWith(")", StringComparison.Ordinal))
return true;
return false;
@@ -1,11 +1,10 @@
using System.Text.RegularExpressions;
using Core.Models;
namespace Core.Parsers.EventOccurrence;
/// <summary>
/// Parses time and location from combined strings.
/// Handles time ranges, location extraction, and pattern matching.
/// Extracts time using regex, then uses everything after the time as the location.
/// </summary>
public static class TimeLocationParser
{
@@ -28,21 +27,18 @@ public static class TimeLocationParser
RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Parses time and location from the timeAndLocation string using configurable location patterns.
/// Parses time and location from the timeAndLocation string.
/// Extracts time using regex, then uses everything after the time as the location (after cleaning time fragments).
/// </summary>
/// <param name="timeAndLocation">The combined time and location string.</param>
/// <param name="locationConfig">The location parsing configuration with patterns.</param>
/// <param name="time">Output parameter: the parsed time string.</param>
/// <param name="location">Output parameter: the parsed location string.</param>
/// <param name="locationParseSuccess">Output parameter: whether location parsing was successful.</param>
public static void Parse(
string timeAndLocation,
LocationParsingConfiguration? locationConfig,
out string time,
out string location,
out bool locationParseSuccess)
out string location)
{
// Try to separate time from location using the time regex
// Extract time using regex
var timeLocationMatch = TimeLocationRegex.Match(timeAndLocation);
if (!timeLocationMatch.Success)
@@ -50,7 +46,6 @@ public static class TimeLocationParser
// If time regex doesn't match, use the whole string as time
time = timeAndLocation.Trim();
location = string.Empty;
locationParseSuccess = false;
return;
}
@@ -63,61 +58,12 @@ public static class TimeLocationParser
if (string.IsNullOrWhiteSpace(locationPart))
{
location = string.Empty;
locationParseSuccess = true; // Consider it a success since no location is needed
return;
}
// Clean up location part - remove any remaining time components
// Clean location of any remaining time fragments
// (e.g., " 12:15 p.m. Exhibit Hall C" -> "Exhibit Hall C")
locationPart = CleanLocationText(locationPart);
if (string.IsNullOrWhiteSpace(locationPart))
{
location = string.Empty;
locationParseSuccess = true; // No location after cleaning is also valid
return;
}
// Try to match location using configurable patterns
(location, locationParseSuccess) = TryMatchLocation(locationPart, locationConfig);
// If no pattern matched but we have a location, use it anyway
// This allows parsing to continue while still tracking that the location didn't match a pattern
if (!locationParseSuccess)
{
location = locationPart;
}
}
/// <summary>
/// Attempts to match a location string against configured patterns.
/// </summary>
private static (string location, bool success) TryMatchLocation(
string locationPart,
LocationParsingConfiguration? locationConfig)
{
// No patterns configured - can't match
if (locationConfig == null || !locationConfig.LocationPatterns.Any())
{
return (string.Empty, false);
}
// Try initial match
var location = LocationPatternMatcher.Match(locationPart, locationConfig.LocationPatterns);
if (!string.IsNullOrEmpty(location))
{
return (location, true);
}
// Try matching against trimmed version (handles extra whitespace)
var cleanedForMatching = locationPart.Trim();
location = LocationPatternMatcher.Match(cleanedForMatching, locationConfig.LocationPatterns);
if (!string.IsNullOrEmpty(location))
{
return (cleanedForMatching, true);
}
return (string.Empty, false);
location = CleanLocationText(locationPart);
}
/// <summary>
+65 -22
View File
@@ -13,19 +13,18 @@ public class EventOccurrenceParserResult
{
public IDictionary<EventDefinition, List<Entities.EventOccurrence>> Occurrences { get; set; } = new Dictionary<EventDefinition, List<Entities.EventOccurrence>>();
public List<ParsingIssue> Issues { get; set; } = new();
public List<string> SkippedHSSectionHeaders { get; set; } = new();
}
public class EventOccurrenceParser
{
private FileSystemInfo _txtFile;
private ICollection<EventDefinition> _events;
private LocationParsingConfiguration? _locationConfig;
public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events, LocationParsingConfiguration? locationConfig = null)
public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events)
{
_events = events;
_txtFile = txtFile;
_locationConfig = locationConfig;
}
public EventOccurrenceParserResult Parse()
@@ -34,6 +33,8 @@ public class EventOccurrenceParser
var occurrences = result.Occurrences;
var issues = result.Issues;
EventDefinition? currentEventDefinition = null;
bool inContinuationMode = false;
bool inHSSection = false;
var lines = File.ReadLines(_txtFile.FullName);
foreach (var (line, index) in lines.Select((line, index) => (line, index + 1)))
@@ -44,11 +45,19 @@ public class EventOccurrenceParser
// Skip empty lines
if (EventOccurrenceParsers.LineClassifier.IsEmptyLine(normalizedLine))
{
// Empty lines break continuation mode
inContinuationMode = false;
continue;
}
// Skip comment lines (starting with "#") - use grammar parser
if (EventOccurrenceParsers.LineClassifier.IsCommentLine(normalizedLine))
{
// Comment lines break continuation mode
inContinuationMode = false;
continue;
}
// Try to parse occurrence line using grammar parser
var occurrenceLine = EventOccurrenceGrammar.TryParseOccurrenceLine(normalizedLine);
@@ -61,10 +70,23 @@ public class EventOccurrenceParser
{
var (eventNamePart, schoolLevel) = sectionHeader.Value;
// Section headers break continuation mode
inContinuationMode = false;
// Use fuzzy matching to find the best matching event definition
var evt = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(eventNamePart, _events);
if (evt == null)
{
// Check if this is an HS event - if so, skip gracefully
if (schoolLevel.Equals("HS", StringComparison.OrdinalIgnoreCase))
{
result.SkippedHSSectionHeaders.Add(normalizedLine);
currentEventDefinition = null; // Skip subsequent occurrences
inHSSection = true; // Mark that we're in an HS section
continue; // No issue created
}
// For non-HS unmatched headers, create issue as before
var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(eventNamePart, _events);
issues.Add(new ParsingIssue
{
@@ -76,12 +98,16 @@ public class EventOccurrenceParser
continue;
}
currentEventDefinition = evt;
inHSSection = false; // Reset HS section flag for MS events
continue;
}
// Check for General Schedule/Session using grammar parser
if (EventOccurrenceParsers.SectionHeaderMatcher.IsGeneralSchedule(normalizedLine))
{
// General schedule breaks continuation mode
inContinuationMode = false;
inHSSection = false; // Reset HS section flag
currentEventDefinition = EventDefinition.GeneralSchedule;
continue;
}
@@ -89,9 +115,22 @@ public class EventOccurrenceParser
// Also check for simple "MS" or "HS" in line (backward compatibility)
if (EventOccurrenceParsers.SectionHeaderMatcher.HasSchoolLevel(normalizedLine))
{
// Section headers break continuation mode
inContinuationMode = false;
var evt = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(normalizedLine, _events);
if (evt == null)
{
// Check if this is an HS event - if so, skip gracefully
if (normalizedLine.Contains("HS", StringComparison.OrdinalIgnoreCase))
{
result.SkippedHSSectionHeaders.Add(normalizedLine);
currentEventDefinition = null; // Skip subsequent occurrences
inHSSection = true; // Mark that we're in an HS section
continue; // No issue created
}
// For non-HS unmatched headers, create issue as before
var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(normalizedLine, _events);
issues.Add(new ParsingIssue
{
@@ -103,11 +142,18 @@ public class EventOccurrenceParser
continue;
}
currentEventDefinition = evt;
inHSSection = false; // Reset HS section flag for MS events
continue;
}
// Skip continuation lines (lines that look like they're continuing from previous line)
if (EventOccurrenceParsers.LineClassifier.IsContinuationLine(normalizedLine))
// Check if line starts with "*" to enter continuation mode
if (normalizedLine.TrimStart().StartsWith("*", StringComparison.Ordinal))
{
inContinuationMode = true;
}
// Skip continuation lines (in continuation mode OR line starts with "*" or is parenthetical)
if (inContinuationMode || EventOccurrenceParsers.LineClassifier.IsContinuationLine(normalizedLine))
{
continue;
}
@@ -127,6 +173,15 @@ public class EventOccurrenceParser
continue;
}
// Occurrence lines break continuation mode
inContinuationMode = false;
// Skip occurrences under HS sections (they won't match any event definition)
if (inHSSection)
{
continue;
}
var (occurrenceName, month, dayOfMonthStr, timeAndLocation) = occurrenceLine.Value;
// Remove weekday suffix from occurrence name if present
@@ -151,23 +206,8 @@ public class EventOccurrenceParser
// timeAndLocation is already normalized (hyphens normalized) since normalizedLine was sanitized
// Parse time and location using configurable patterns
EventOccurrenceParsers.TimeLocationParser.Parse(timeAndLocation, _locationConfig, out string time, out string location, out bool locationParseSuccess);
// Track location parsing failure if patterns are configured but none matched
if (!locationParseSuccess && !string.IsNullOrWhiteSpace(location))
{
if (_locationConfig != null && _locationConfig.LocationPatterns.Any())
{
issues.Add(new ParsingIssue
{
LineNumber = index,
LineContent = normalizedLine,
IssueType = ParsingIssueType.LocationParseFailure,
Message = $"Location '{location}' does not match any configured pattern"
});
}
}
// Parse time and location - extract time using regex, then use everything after time as location
EventOccurrenceParsers.TimeLocationParser.Parse(timeAndLocation, out string time, out string location);
// Parse date
DateOnly? startDate = null;
@@ -222,6 +262,9 @@ public class EventOccurrenceParser
if (!occurrences.ContainsKey(eventDefinition))
occurrences.Add(eventDefinition, []);
occurrences[eventDefinition].Add(eventOccurrence);
// Reset HS section flag when we successfully parse an occurrence (means we're in a valid section)
inHSSection = false;
}
return result;
+6 -14
View File
@@ -12,20 +12,9 @@ namespace Core.Services;
/// </summary>
public class EventOccurrenceParserService : IEventOccurrenceParserService
{
private readonly LocationParsingConfiguration? _locationConfig;
public EventOccurrenceParserService(IConfiguration? configuration = null)
{
// Load location parsing configuration from IConfiguration if provided
if (configuration != null)
{
_locationConfig = configuration.GetSection("LocationParsingSettings").Get<LocationParsingConfiguration>()
?? LocationParsingConfiguration.Default;
}
else
{
_locationConfig = LocationParsingConfiguration.Default;
}
// Configuration parameter kept for backward compatibility but not used
}
/// <inheritdoc/>
@@ -48,8 +37,8 @@ public class EventOccurrenceParserService : IEventOccurrenceParserService
File.WriteAllText(tempFile, text, Encoding.UTF8);
var fileInfo = new FileInfo(tempFile);
// Use the existing EventOccurrenceParser with location configuration
var parser = new EventOccurrenceParser(fileInfo, events, _locationConfig);
// Use the existing EventOccurrenceParser
var parser = new EventOccurrenceParser(fileInfo, events);
var parserResult = parser.Parse();
// Copy occurrences from parser result
@@ -101,6 +90,9 @@ public class EventOccurrenceParserService : IEventOccurrenceParserService
// Copy parsing issues from parser result
result.Issues.AddRange(parserResult.Issues);
// Copy skipped HS section headers from parser result
result.SkippedHSSectionHeaders.AddRange(parserResult.SkippedHSSectionHeaders);
}
finally
{