19e5ef0675
This commit introduces a new property to track skipped high school section headers in the EventOccurrenceParseResult and EventOccurrenceParserResult classes. The EventOccurrenceParser has been updated to gracefully skip HS section headers that do not match any event definitions, improving the parsing logic. Additionally, the LocationParsingConfiguration has been removed from the EventOccurrenceParser, simplifying its constructor. Unit tests have been updated to reflect these changes and ensure correct behavior during parsing.
92 lines
3.9 KiB
C#
92 lines
3.9 KiB
C#
using System.Text.RegularExpressions;
|
||
|
||
namespace Core.Parsers.EventOccurrence;
|
||
|
||
/// <summary>
/// Splits a combined "time + location" string into its two parts.
/// The time (or time range) is located with a regular expression; whatever text
/// follows it is treated as the location once leading time fragments are stripped.
/// </summary>
public static class TimeLocationParser
{
    // Single time token, supplied by TimePatterns.TimeValue
    // (expected to cover NOON and AM/PM times such as "10:30 a.m." or "3 p.m.").
    private static string TimeValuePattern => TimePatterns.TimeValue;

    // Recognizes a time or a time range ("10:30 a.m. - 12:00 p.m.", "10:30 a.m. - NOON")
    // followed by an optional location.
    //   Time     -> the first time token plus, if present, "- <second time token>"
    //   Location -> everything after the whitespace that follows the time
    // Only the plain hyphen is accepted as a range separator; per the original notes,
    // callers are assumed to have normalized other dash characters beforehand.
    private static readonly Regex TimeLocationRegex = new(
        $@"(?<Time>{TimeValuePattern}(?:\s*-\s*{TimeValuePattern})?)(?:\s+(?<Location>.+))?",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Recognizes a stray time token at the very start of a location string:
    // an optional leading hyphen, the time token, then whitespace or end of input.
    // Examples: "- 12:15 p.m. ", "12:15 p.m. ", "- NOON ", "NOON ".
    private static readonly Regex TimeInLocationPattern = new(
        $@"^(?:-\s*)?{TimeValuePattern}(?:\s+|$)",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Separates <paramref name="timeAndLocation"/> into a time string and a location string.
    /// When no time can be found, the whole trimmed input is returned as the time and the
    /// location is empty. A missing location is valid and also yields an empty string.
    /// </summary>
    /// <param name="timeAndLocation">The combined time and location string.</param>
    /// <param name="time">Output parameter: the parsed time string.</param>
    /// <param name="location">Output parameter: the parsed location string.</param>
    public static void Parse(
        string timeAndLocation,
        out string time,
        out string location)
    {
        var match = TimeLocationRegex.Match(timeAndLocation);

        // No recognizable time: hand the trimmed input back as the time.
        if (!match.Success)
        {
            time = timeAndLocation.Trim();
            location = string.Empty;
            return;
        }

        time = match.Groups["Time"].Value.Trim();

        // The location group is optional; CleanLocationText maps blank text to an empty
        // string and strips leading time fragments
        // (e.g., "- 12:15 p.m. Exhibit Hall C" -> "Exhibit Hall C").
        var locationGroup = match.Groups["Location"];
        location = locationGroup.Success
            ? CleanLocationText(locationGroup.Value.Trim())
            : string.Empty;
    }

    /// <summary>
    /// Strips time fragments from the beginning of a location string, repeating until
    /// none remain, e.g. "- 12:15 p.m. Exhibit Hall C" -> "Exhibit Hall C".
    /// Only plain hyphens are recognized in front of a time fragment.
    /// </summary>
    /// <param name="locationText">The raw location text.</param>
    /// <returns>The trimmed location, or an empty string if nothing but time fragments or whitespace was present.</returns>
    public static string CleanLocationText(string locationText)
    {
        if (string.IsNullOrWhiteSpace(locationText))
        {
            return string.Empty;
        }

        var current = locationText;
        while (true)
        {
            var stripped = TimeInLocationPattern.Replace(current, string.Empty).Trim();

            // Everything was consumed: there is no location left.
            if (stripped.Length == 0)
            {
                return string.Empty;
            }

            // Nothing changed on this pass, so no leading time fragment remains.
            if (stripped == current)
            {
                return stripped;
            }

            current = stripped;
        }
    }
}
|
||
|