Enhance event occurrence parsing to skip unmatched high school section headers
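This commit adds a SkippedHSSectionHeaders property to the EventOccurrenceParseResult and EventOccurrenceParserResult classes to record high school (HS) section headers that were skipped. The EventOccurrenceParser now gracefully skips HS section headers that do not match any event definition: the header is recorded, no parsing issue is created, and the occurrence lines under that header are ignored until the next matched section header or General Schedule. Additionally, the LocationParsingConfiguration has been removed from the EventOccurrenceParser, simplifying its constructor; TimeLocationParser.Parse now extracts the time with a regex and uses the cleaned remainder as the location, and the LocationParseFailure issue type has been removed. Lines starting with "*" now begin a continuation block that is skipped until the next empty line, comment line, section header, or occurrence line. Unit tests have been updated to reflect these changes and ensure correct behavior during parsing.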
This commit is contained in:
@@ -30,6 +30,12 @@ public class EventOccurrenceParseResult
|
||||
/// </summary>
|
||||
public List<ParsingIssue> Issues { get; set; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// List of high school (HS) section headers that were encountered but skipped
|
||||
/// because they don't match any event definition in the system.
|
||||
/// </summary>
|
||||
public List<string> SkippedHSSectionHeaders { get; set; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Total number of event occurrences successfully parsed.
|
||||
/// </summary>
|
||||
@@ -110,11 +116,6 @@ public enum ParsingIssueType
|
||||
/// <summary>
|
||||
/// Invalid format or other parsing issue.
|
||||
/// </summary>
|
||||
InvalidFormat,
|
||||
|
||||
/// <summary>
|
||||
/// Location parsing failed (no matching pattern found).
|
||||
/// </summary>
|
||||
LocationParseFailure
|
||||
InvalidFormat
|
||||
}
|
||||
|
||||
|
||||
@@ -24,33 +24,19 @@ public static class LineClassifier
|
||||
/// <summary>
|
||||
/// Determines if a line is a continuation/wrapped line that should be skipped.
|
||||
/// These are typically lines that:
|
||||
/// - Start with lowercase or special characters (not event names)
|
||||
/// - Start with "*" (marks the start of a continuation block)
|
||||
/// - Are parenthetical notes like "(Semifinalists only)"
|
||||
/// - Are informational text like "Schedule Posted on..."
|
||||
/// </summary>
|
||||
public static bool IsContinuationLine(string line)
|
||||
{
|
||||
var trimmed = line.Trim();
|
||||
|
||||
// Skip parenthetical notes
|
||||
if (trimmed.StartsWith("(", StringComparison.Ordinal) && trimmed.EndsWith(")", StringComparison.Ordinal))
|
||||
// Check if line starts with "*" (marks continuation block start)
|
||||
if (trimmed.StartsWith("*", StringComparison.Ordinal))
|
||||
return true;
|
||||
|
||||
// Skip lines that are clearly continuation text (start with lowercase, common continuation words)
|
||||
if (trimmed.Length > 0 && char.IsLower(trimmed[0]))
|
||||
{
|
||||
// Check if it starts with common continuation words
|
||||
var continuationPrefixes = new[] { "be ", "the ", "and ", "or ", "to ", "a ", "an ", "will ", "may ", "can " };
|
||||
foreach (var prefix in continuationPrefixes)
|
||||
{
|
||||
if (trimmed.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Skip informational lines that don't contain dates/times
|
||||
if (trimmed.Contains("Schedule Posted", StringComparison.OrdinalIgnoreCase) ||
|
||||
trimmed.Contains("Note:", StringComparison.OrdinalIgnoreCase))
|
||||
// Skip parenthetical notes
|
||||
if (trimmed.StartsWith("(", StringComparison.Ordinal) && trimmed.EndsWith(")", StringComparison.Ordinal))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using Core.Models;
|
||||
|
||||
namespace Core.Parsers.EventOccurrence;
|
||||
|
||||
/// <summary>
|
||||
/// Parses time and location from combined strings.
|
||||
/// Handles time ranges, location extraction, and pattern matching.
|
||||
/// Extracts time using regex, then uses everything after the time as the location.
|
||||
/// </summary>
|
||||
public static class TimeLocationParser
|
||||
{
|
||||
@@ -28,21 +27,18 @@ public static class TimeLocationParser
|
||||
RegexOptions.Compiled | RegexOptions.IgnoreCase);
|
||||
|
||||
/// <summary>
|
||||
/// Parses time and location from the timeAndLocation string using configurable location patterns.
|
||||
/// Parses time and location from the timeAndLocation string.
|
||||
/// Extracts time using regex, then uses everything after the time as the location (after cleaning time fragments).
|
||||
/// </summary>
|
||||
/// <param name="timeAndLocation">The combined time and location string.</param>
|
||||
/// <param name="locationConfig">The location parsing configuration with patterns.</param>
|
||||
/// <param name="time">Output parameter: the parsed time string.</param>
|
||||
/// <param name="location">Output parameter: the parsed location string.</param>
|
||||
/// <param name="locationParseSuccess">Output parameter: whether location parsing was successful.</param>
|
||||
public static void Parse(
|
||||
string timeAndLocation,
|
||||
LocationParsingConfiguration? locationConfig,
|
||||
out string time,
|
||||
out string location,
|
||||
out bool locationParseSuccess)
|
||||
out string location)
|
||||
{
|
||||
// Try to separate time from location using the time regex
|
||||
// Extract time using regex
|
||||
var timeLocationMatch = TimeLocationRegex.Match(timeAndLocation);
|
||||
|
||||
if (!timeLocationMatch.Success)
|
||||
@@ -50,7 +46,6 @@ public static class TimeLocationParser
|
||||
// If time regex doesn't match, use the whole string as time
|
||||
time = timeAndLocation.Trim();
|
||||
location = string.Empty;
|
||||
locationParseSuccess = false;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -63,61 +58,12 @@ public static class TimeLocationParser
|
||||
if (string.IsNullOrWhiteSpace(locationPart))
|
||||
{
|
||||
location = string.Empty;
|
||||
locationParseSuccess = true; // Consider it a success since no location is needed
|
||||
return;
|
||||
}
|
||||
|
||||
// Clean up location part - remove any remaining time components
|
||||
// Clean location of any remaining time fragments
|
||||
// (e.g., "– 12:15 p.m. Exhibit Hall C" -> "Exhibit Hall C")
|
||||
locationPart = CleanLocationText(locationPart);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(locationPart))
|
||||
{
|
||||
location = string.Empty;
|
||||
locationParseSuccess = true; // No location after cleaning is also valid
|
||||
return;
|
||||
}
|
||||
|
||||
// Try to match location using configurable patterns
|
||||
(location, locationParseSuccess) = TryMatchLocation(locationPart, locationConfig);
|
||||
|
||||
// If no pattern matched but we have a location, use it anyway
|
||||
// This allows parsing to continue while still tracking that the location didn't match a pattern
|
||||
if (!locationParseSuccess)
|
||||
{
|
||||
location = locationPart;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Attempts to match a location string against configured patterns.
|
||||
/// </summary>
|
||||
private static (string location, bool success) TryMatchLocation(
|
||||
string locationPart,
|
||||
LocationParsingConfiguration? locationConfig)
|
||||
{
|
||||
// No patterns configured - can't match
|
||||
if (locationConfig == null || !locationConfig.LocationPatterns.Any())
|
||||
{
|
||||
return (string.Empty, false);
|
||||
}
|
||||
|
||||
// Try initial match
|
||||
var location = LocationPatternMatcher.Match(locationPart, locationConfig.LocationPatterns);
|
||||
if (!string.IsNullOrEmpty(location))
|
||||
{
|
||||
return (location, true);
|
||||
}
|
||||
|
||||
// Try matching against trimmed version (handles extra whitespace)
|
||||
var cleanedForMatching = locationPart.Trim();
|
||||
location = LocationPatternMatcher.Match(cleanedForMatching, locationConfig.LocationPatterns);
|
||||
if (!string.IsNullOrEmpty(location))
|
||||
{
|
||||
return (cleanedForMatching, true);
|
||||
}
|
||||
|
||||
return (string.Empty, false);
|
||||
location = CleanLocationText(locationPart);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -13,19 +13,18 @@ public class EventOccurrenceParserResult
|
||||
{
|
||||
public IDictionary<EventDefinition, List<Entities.EventOccurrence>> Occurrences { get; set; } = new Dictionary<EventDefinition, List<Entities.EventOccurrence>>();
|
||||
public List<ParsingIssue> Issues { get; set; } = new();
|
||||
public List<string> SkippedHSSectionHeaders { get; set; } = new();
|
||||
}
|
||||
|
||||
public class EventOccurrenceParser
|
||||
{
|
||||
private FileSystemInfo _txtFile;
|
||||
private ICollection<EventDefinition> _events;
|
||||
private LocationParsingConfiguration? _locationConfig;
|
||||
|
||||
public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events, LocationParsingConfiguration? locationConfig = null)
|
||||
public EventOccurrenceParser(FileSystemInfo txtFile, ICollection<EventDefinition> events)
|
||||
{
|
||||
_events = events;
|
||||
_txtFile = txtFile;
|
||||
_locationConfig = locationConfig;
|
||||
}
|
||||
|
||||
public EventOccurrenceParserResult Parse()
|
||||
@@ -34,6 +33,8 @@ public class EventOccurrenceParser
|
||||
var occurrences = result.Occurrences;
|
||||
var issues = result.Issues;
|
||||
EventDefinition? currentEventDefinition = null;
|
||||
bool inContinuationMode = false;
|
||||
bool inHSSection = false;
|
||||
|
||||
var lines = File.ReadLines(_txtFile.FullName);
|
||||
foreach (var (line, index) in lines.Select((line, index) => (line, index + 1)))
|
||||
@@ -44,11 +45,19 @@ public class EventOccurrenceParser
|
||||
|
||||
// Skip empty lines
|
||||
if (EventOccurrenceParsers.LineClassifier.IsEmptyLine(normalizedLine))
|
||||
{
|
||||
// Empty lines break continuation mode
|
||||
inContinuationMode = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip comment lines (starting with "#") - use grammar parser
|
||||
if (EventOccurrenceParsers.LineClassifier.IsCommentLine(normalizedLine))
|
||||
{
|
||||
// Comment lines break continuation mode
|
||||
inContinuationMode = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Try to parse occurrence line using grammar parser
|
||||
var occurrenceLine = EventOccurrenceGrammar.TryParseOccurrenceLine(normalizedLine);
|
||||
@@ -61,10 +70,23 @@ public class EventOccurrenceParser
|
||||
{
|
||||
var (eventNamePart, schoolLevel) = sectionHeader.Value;
|
||||
|
||||
// Section headers break continuation mode
|
||||
inContinuationMode = false;
|
||||
|
||||
// Use fuzzy matching to find the best matching event definition
|
||||
var evt = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(eventNamePart, _events);
|
||||
if (evt == null)
|
||||
{
|
||||
// Check if this is an HS event - if so, skip gracefully
|
||||
if (schoolLevel.Equals("HS", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
result.SkippedHSSectionHeaders.Add(normalizedLine);
|
||||
currentEventDefinition = null; // Skip subsequent occurrences
|
||||
inHSSection = true; // Mark that we're in an HS section
|
||||
continue; // No issue created
|
||||
}
|
||||
|
||||
// For non-HS unmatched headers, create issue as before
|
||||
var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(eventNamePart, _events);
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
@@ -76,12 +98,16 @@ public class EventOccurrenceParser
|
||||
continue;
|
||||
}
|
||||
currentEventDefinition = evt;
|
||||
inHSSection = false; // Reset HS section flag for MS events
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for General Schedule/Session using grammar parser
|
||||
if (EventOccurrenceParsers.SectionHeaderMatcher.IsGeneralSchedule(normalizedLine))
|
||||
{
|
||||
// General schedule breaks continuation mode
|
||||
inContinuationMode = false;
|
||||
inHSSection = false; // Reset HS section flag
|
||||
currentEventDefinition = EventDefinition.GeneralSchedule;
|
||||
continue;
|
||||
}
|
||||
@@ -89,9 +115,22 @@ public class EventOccurrenceParser
|
||||
// Also check for simple "MS" or "HS" in line (backward compatibility)
|
||||
if (EventOccurrenceParsers.SectionHeaderMatcher.HasSchoolLevel(normalizedLine))
|
||||
{
|
||||
// Section headers break continuation mode
|
||||
inContinuationMode = false;
|
||||
|
||||
var evt = EventOccurrenceParsers.SectionHeaderMatcher.MatchEventDefinition(normalizedLine, _events);
|
||||
if (evt == null)
|
||||
{
|
||||
// Check if this is an HS event - if so, skip gracefully
|
||||
if (normalizedLine.Contains("HS", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
result.SkippedHSSectionHeaders.Add(normalizedLine);
|
||||
currentEventDefinition = null; // Skip subsequent occurrences
|
||||
inHSSection = true; // Mark that we're in an HS section
|
||||
continue; // No issue created
|
||||
}
|
||||
|
||||
// For non-HS unmatched headers, create issue as before
|
||||
var bestRatio = EventOccurrenceParsers.SectionHeaderMatcher.GetBestMatchRatio(normalizedLine, _events);
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
@@ -103,11 +142,18 @@ public class EventOccurrenceParser
|
||||
continue;
|
||||
}
|
||||
currentEventDefinition = evt;
|
||||
inHSSection = false; // Reset HS section flag for MS events
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip continuation lines (lines that look like they're continuing from previous line)
|
||||
if (EventOccurrenceParsers.LineClassifier.IsContinuationLine(normalizedLine))
|
||||
// Check if line starts with "*" to enter continuation mode
|
||||
if (normalizedLine.TrimStart().StartsWith("*", StringComparison.Ordinal))
|
||||
{
|
||||
inContinuationMode = true;
|
||||
}
|
||||
|
||||
// Skip continuation lines (in continuation mode OR line starts with "*" or is parenthetical)
|
||||
if (inContinuationMode || EventOccurrenceParsers.LineClassifier.IsContinuationLine(normalizedLine))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@@ -127,6 +173,15 @@ public class EventOccurrenceParser
|
||||
continue;
|
||||
}
|
||||
|
||||
// Occurrence lines break continuation mode
|
||||
inContinuationMode = false;
|
||||
|
||||
// Skip occurrences under HS sections (they won't match any event definition)
|
||||
if (inHSSection)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var (occurrenceName, month, dayOfMonthStr, timeAndLocation) = occurrenceLine.Value;
|
||||
|
||||
// Remove weekday suffix from occurrence name if present
|
||||
@@ -151,23 +206,8 @@ public class EventOccurrenceParser
|
||||
|
||||
// timeAndLocation is already normalized (hyphens normalized) since normalizedLine was sanitized
|
||||
|
||||
// Parse time and location using configurable patterns
|
||||
EventOccurrenceParsers.TimeLocationParser.Parse(timeAndLocation, _locationConfig, out string time, out string location, out bool locationParseSuccess);
|
||||
|
||||
// Track location parsing failure if patterns are configured but none matched
|
||||
if (!locationParseSuccess && !string.IsNullOrWhiteSpace(location))
|
||||
{
|
||||
if (_locationConfig != null && _locationConfig.LocationPatterns.Any())
|
||||
{
|
||||
issues.Add(new ParsingIssue
|
||||
{
|
||||
LineNumber = index,
|
||||
LineContent = normalizedLine,
|
||||
IssueType = ParsingIssueType.LocationParseFailure,
|
||||
Message = $"Location '{location}' does not match any configured pattern"
|
||||
});
|
||||
}
|
||||
}
|
||||
// Parse time and location - extract time using regex, then use everything after time as location
|
||||
EventOccurrenceParsers.TimeLocationParser.Parse(timeAndLocation, out string time, out string location);
|
||||
|
||||
// Parse date
|
||||
DateOnly? startDate = null;
|
||||
@@ -222,6 +262,9 @@ public class EventOccurrenceParser
|
||||
if (!occurrences.ContainsKey(eventDefinition))
|
||||
occurrences.Add(eventDefinition, []);
|
||||
occurrences[eventDefinition].Add(eventOccurrence);
|
||||
|
||||
// Reset HS section flag when we successfully parse an occurrence (means we're in a valid section)
|
||||
inHSSection = false;
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
@@ -12,20 +12,9 @@ namespace Core.Services;
|
||||
/// </summary>
|
||||
public class EventOccurrenceParserService : IEventOccurrenceParserService
|
||||
{
|
||||
private readonly LocationParsingConfiguration? _locationConfig;
|
||||
|
||||
public EventOccurrenceParserService(IConfiguration? configuration = null)
|
||||
{
|
||||
// Load location parsing configuration from IConfiguration if provided
|
||||
if (configuration != null)
|
||||
{
|
||||
_locationConfig = configuration.GetSection("LocationParsingSettings").Get<LocationParsingConfiguration>()
|
||||
?? LocationParsingConfiguration.Default;
|
||||
}
|
||||
else
|
||||
{
|
||||
_locationConfig = LocationParsingConfiguration.Default;
|
||||
}
|
||||
// Configuration parameter kept for backward compatibility but not used
|
||||
}
|
||||
|
||||
/// <inheritdoc/>
|
||||
@@ -48,8 +37,8 @@ public class EventOccurrenceParserService : IEventOccurrenceParserService
|
||||
File.WriteAllText(tempFile, text, Encoding.UTF8);
|
||||
var fileInfo = new FileInfo(tempFile);
|
||||
|
||||
// Use the existing EventOccurrenceParser with location configuration
|
||||
var parser = new EventOccurrenceParser(fileInfo, events, _locationConfig);
|
||||
// Use the existing EventOccurrenceParser
|
||||
var parser = new EventOccurrenceParser(fileInfo, events);
|
||||
var parserResult = parser.Parse();
|
||||
|
||||
// Copy occurrences from parser result
|
||||
@@ -101,6 +90,9 @@ public class EventOccurrenceParserService : IEventOccurrenceParserService
|
||||
|
||||
// Copy parsing issues from parser result
|
||||
result.Issues.AddRange(parserResult.Issues);
|
||||
|
||||
// Copy skipped HS section headers from parser result
|
||||
result.SkippedHSSectionHeaders.AddRange(parserResult.SkippedHSSectionHeaders);
|
||||
}
|
||||
finally
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user