Files
chapter-organizer/Core/Parsers/EventOccurrence/TimeLocationParser.cs
T
poprhythm f916cfad6b Refactor event occurrence parsing by introducing modular components for improved maintainability
This commit restructures the EventOccurrenceParser by breaking down its functionality into modular components, including EventDefinitionResolver, LineClassifier, LocationPatternMatcher, SectionHeaderMatcher, TimeLocationParser, and TimeParser. This refactoring enhances code readability and maintainability, allowing for easier updates and testing. Additionally, the TextUtil class has been updated to include input sanitization methods. Comprehensive unit tests have been added to ensure the correctness of the new parsing logic and to validate the handling of various event occurrence scenarios.
2026-01-08 20:23:57 -05:00

146 lines
5.9 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.RegularExpressions;
using Core.Models;
namespace Core.Parsers.EventOccurrence;
/// <summary>
/// Splits a combined "time + location" string into its time part and its location part.
/// The time part may be a single time or a range ("10:30 a.m. - NOON"); whatever text follows
/// it is treated as the location and optionally checked against configured location patterns.
/// </summary>
public static class TimeLocationParser
{
    // Shared time value pattern, supplied by TimePatterns.TimeValue
    // (intended to match either NOON or a time with AM/PM, e.g. "10:30 a.m.", "3 p.m.").
    private static string TimeValuePattern => TimePatterns.TimeValue;

    // Matches: time1, an optional "- time2", then (optionally) whitespace and the location text.
    // The Time group captures the full range (including " - NOON" if present); the Location group
    // captures everything after the whitespace that follows the time.
    // The pattern is not anchored, so the first time found anywhere in the input is used.
    // Note: Input is expected to be normalized (SanitizeInput), so only regular hyphens are handled.
    private static readonly Regex TimeLocationRegex = new(
        $@"(?<Time>{TimeValuePattern}(?:\s*-\s*{TimeValuePattern})?)(?:\s+(?<Location>.+))?",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Matches a leftover time component at the very start of location text:
    // optional dash, optional whitespace, a time value, then whitespace or end of input.
    // Handles: "- 12:15 p.m. ", "12:15 p.m. ", "- NOON ", "NOON ", etc.
    private static readonly Regex TimeInLocationPattern = new(
        $@"^(?:-\s*)?{TimeValuePattern}(?:\s+|$)",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Parses time and location from <paramref name="timeAndLocation"/>, using the configured
    /// location patterns (if any) to decide whether the location is a recognized one.
    /// </summary>
    /// <param name="timeAndLocation">The combined time and location string.</param>
    /// <param name="locationConfig">The location parsing configuration with patterns; may be null.</param>
    /// <param name="time">
    /// Output: the trimmed time (or time range). When no time is found, the whole trimmed input.
    /// </param>
    /// <param name="location">
    /// Output: the pattern-matched location, or the cleaned raw location text when no pattern
    /// matched, or an empty string when there is no location text.
    /// </param>
    /// <param name="locationParseSuccess">
    /// Output: true when there is no location text at all or when a configured pattern matched;
    /// false when no time was found or when location text exists but matched no pattern.
    /// </param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="timeAndLocation"/> is null (raised by the underlying regex match).
    /// </exception>
    public static void Parse(
        string timeAndLocation,
        LocationParsingConfiguration? locationConfig,
        out string time,
        out string location,
        out bool locationParseSuccess)
    {
        var match = TimeLocationRegex.Match(timeAndLocation);
        if (!match.Success)
        {
            // No recognizable time: hand the whole string back as the time and flag the failure.
            time = timeAndLocation.Trim();
            location = string.Empty;
            locationParseSuccess = false;
            return;
        }

        time = match.Groups["Time"].Value.Trim();

        // Group.Value is "" when the optional Location group did not participate, so no explicit
        // Success check is needed. CleanLocationText trims and strips any leading time remnants
        // (e.g. "12:15 p.m. Exhibit Hall C" -> "Exhibit Hall C").
        var locationPart = CleanLocationText(match.Groups["Location"].Value);
        if (locationPart.Length == 0)
        {
            // Having no location (before or after cleaning) is valid: some events have none.
            location = string.Empty;
            locationParseSuccess = true;
            return;
        }

        (location, locationParseSuccess) = TryMatchLocation(locationPart, locationConfig);

        // Keep the raw text when no pattern matched, so parsing can continue while the
        // false flag still records that the location was not recognized.
        if (!locationParseSuccess)
        {
            location = locationPart;
        }
    }

    /// <summary>
    /// Attempts to match a location string against the configured patterns.
    /// </summary>
    /// <param name="locationPart">Location text; <see cref="Parse"/> passes it already cleaned and trimmed.</param>
    /// <param name="locationConfig">The configuration holding the patterns; may be null.</param>
    /// <returns>
    /// The matcher's result and true when it is non-empty; otherwise an empty string and false
    /// (including when no configuration or no patterns are available).
    /// </returns>
    private static (string location, bool success) TryMatchLocation(
        string locationPart,
        LocationParsingConfiguration? locationConfig)
    {
        // No patterns configured - nothing to match against.
        if (locationConfig is null || !locationConfig.LocationPatterns.Any())
        {
            return (string.Empty, false);
        }

        // A single attempt is enough: the input is already trimmed, so retrying on a trimmed
        // copy would call the matcher with the identical string.
        var matched = LocationPatternMatcher.Match(locationPart, locationConfig.LocationPatterns);
        if (string.IsNullOrEmpty(matched))
        {
            return (string.Empty, false);
        }

        return (matched, true);
    }

    /// <summary>
    /// Cleans location text by removing any time components left at its start.
    /// Handles cases like "- 12:15 p.m. Exhibit Hall C" -> "Exhibit Hall C".
    /// Note: Input is expected to be normalized, so only regular hyphens are handled.
    /// </summary>
    /// <param name="locationText">The raw location text; null or whitespace yields an empty string.</param>
    /// <returns>The trimmed location text without leading time components (possibly empty).</returns>
    public static string CleanLocationText(string locationText)
    {
        if (string.IsNullOrWhiteSpace(locationText))
        {
            return string.Empty;
        }

        var current = locationText.Trim();

        // Strip one leading time component per pass until nothing changes or nothing is left;
        // this copes with several stacked remnants such as "- NOON - 12:15 p.m. Hall".
        while (current.Length > 0)
        {
            var stripped = TimeInLocationPattern.Replace(current, string.Empty).Trim();
            if (stripped == current)
            {
                break;
            }

            current = stripped;
        }

        return current;
    }
}