Initial commit — AllMusicGuide scraper and music metadata tagger

This commit is contained in:
2026-05-10 02:49:16 +00:00
commit 4a541ca04b
194 changed files with 46364 additions and 0 deletions
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the page of a single album and returns it as XML.
/// </summary>
public class AlbumClient : XmlClientBase
{
    /// <summary>
    /// Reads the album page for the given album id.
    /// </summary>
    /// <param name="albumId"> The album id substituted into the configured album URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string albumId)
    {
        return ReadFromCriteria(AlbumScraper.Default.AlbumUrl, albumId);
    }
}
}
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the album search result page and returns it as XML.
/// </summary>
public class AlbumResultPageClient : XmlClientBase
{
    /// <summary>
    /// Reads the album search result page for the given search criteria.
    /// </summary>
    /// <param name="criteria"> The search text substituted into the configured album result URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string criteria)
    {
        return ReadFromCriteria(AlbumResultPageScraper.Default.AlbumResultUrl, criteria);
    }
}
}
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the page of a single artist and returns it as XML.
/// </summary>
public class ArtistClient : XmlClientBase
{
    /// <summary>
    /// Reads the artist page for the given artist id.
    /// </summary>
    /// <param name="artistId"> The artist id substituted into the configured artist URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string artistId)
    {
        return ReadFromCriteria(ArtistScaper.Default.ArtistUrl, artistId);
    }
}
}
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the album section of an artist's discography and returns it as XML.
/// </summary>
public class ArtistDiscographyAlbumPageClient : XmlClientBase
{
    /// <summary>
    /// Reads the discography album page for the given artist id.
    /// </summary>
    /// <param name="artistId"> The artist id substituted into the configured discography album URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string artistId)
    {
        return ReadFromCriteria(ArtistDiscographyPageScraper.Default.DiscographyAlbumUrl, artistId);
    }
}
}
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the compilation section of an artist's discography and returns it as XML.
/// </summary>
public class ArtistDiscographyCompilationPageClient : XmlClientBase
{
    /// <summary>
    /// Reads the discography compilation page for the given artist id.
    /// </summary>
    /// <param name="artistId"> The artist id substituted into the configured discography compilation URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string artistId)
    {
        return ReadFromCriteria(ArtistDiscographyPageScraper.Default.DiscographyCompilationUrl, artistId);
    }
}
}
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the EP/singles section of an artist's discography and returns it as XML.
/// </summary>
public class ArtistDiscographyEpSinglesPageClient : XmlClientBase
{
    /// <summary>
    /// Reads the discography EP/singles page for the given artist id.
    /// </summary>
    /// <param name="artistId"> The artist id substituted into the configured discography EP/singles URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string artistId)
    {
        return ReadFromCriteria(ArtistDiscographyPageScraper.Default.DiscographyEpSinglesUrl, artistId);
    }
}
}
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the artist search result page and returns it as XML.
/// </summary>
public class ArtistResultPageClient : XmlClientBase
{
    /// <summary>
    /// Reads the artist search result page for the given search criteria.
    /// </summary>
    /// <param name="criteria"> The search text substituted into the configured artist result URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string criteria)
    {
        return ReadFromCriteria(ArtistResultPageScraper.Default.ArtistResultUrl, criteria);
    }
}
}
@@ -0,0 +1,9 @@
using System.Xml;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Contract for a client that turns a textual criteria (an id or a search term) into an XML document.
/// </summary>
public interface IXmlClientBase
{
/// <summary>
/// Reads the document that corresponds to the given criteria.
/// </summary>
/// <param name="criteria"> The id or search text identifying what to read. </param>
/// <returns> The document that was read, as an XmlDocument. </returns>
XmlDocument Read(string criteria);
}
}
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the page of a single song and returns it as XML.
/// </summary>
public class SongClient : XmlClientBase
{
    /// <summary>
    /// Reads the song page for the given criteria.
    /// </summary>
    /// <param name="criteria"> The value substituted into the configured song URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string criteria)
    {
        return ReadFromCriteria(SongScraper.Default.SongUrl, criteria);
    }
}
}
@@ -0,0 +1,14 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Client that downloads the song search result page and returns it as XML.
/// </summary>
public class SongResultPageClient : XmlClientBase
{
    /// <summary>
    /// Reads the song search result page for the given search criteria.
    /// </summary>
    /// <param name="criteria"> The search text substituted into the configured song result URL format. </param>
    /// <returns> The downloaded page as an XmlDocument. </returns>
    public override XmlDocument Read(string criteria)
    {
        return ReadFromCriteria(SongResultPageScraper.Default.SongResultUrl, criteria);
    }
}
}
@@ -0,0 +1,129 @@
using System;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Xml;
using HtmlAgilityPack;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
public static class WebXmlClient
{
/// <summary>
/// Returns the HTML grabbed from the passed in URL.
/// </summary>
/// <remarks>
/// A failed download is retried after a 100 ms pause. Up to five attempts are made; when the
/// fifth one fails, the last error is wrapped in a <see cref="WebException"/> whose message
/// also contains the URL.
/// </remarks>
/// <param name="url"> The URL. </param>
/// <returns> A string containing html </returns>
/// <exception cref="WebException"> Thrown when every attempt failed; the original error is the inner exception. </exception>
public static string GetHtml(Uri url)
{
    var tries = 0;
    while (true)
    {
        try
        {
            // WebClient is IDisposable: dispose it on every attempt so retries do not leak clients.
            using (var client = new WebClient())
            using (var data = client.OpenRead(url))
            using (var reader = new StreamReader(data))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // tries is compared before it is incremented, so the throw happens on the fifth failure.
            if (tries++ > 3)
                throw new WebException(ex.Message + " : " + url, ex);
            Thread.Sleep(100);
        }
    }
}
/// <summary>
/// Downloads the resource at the given URL into a local file.
/// </summary>
/// <param name="url"> The URL to download. </param>
/// <param name="filename"> The path of the local file that receives the data. </param>
public static void SaveFile(Uri url, string filename)
{
    // WebClient is IDisposable: release it as soon as the download finished (or failed).
    using (var client = new WebClient())
    {
        client.DownloadFile(url, filename);
    }
}
/// <summary>
/// Returns the HTML parsed into a standard XmlDocument. Uses the HtmlAgilityPack library for "out of the web" (poorly formatted) html file support.
/// </summary>
/// <remarks>
/// Steps: strip a few known broken tag fragments from the text, load it into an HtmlDocument,
/// remove two elements (class 'rovi-share' and id 'similar-albums'), make sure there is a single
/// html root, then write the document through an XmlTextWriter into memory and load that into
/// the XmlDocument that is returned.
/// </remarks>
/// <param name="html"> The HTML </param>
/// <returns> An XmlDocument from the HTML </returns>
public static XmlDocument GetHtmlXml(string html)
{
// Sometimes the biography will have "<..." in it. Annoying.
// The three replacements below delete these literal fragments before parsing.
html = html.Replace("<...", "");
html = html.Replace("<class=\"subtitle\">", "");
html = html.Replace("< ", "");
var htmlDoc = new HtmlDocument {OptionOutputAsXml = true};
htmlDoc.LoadHtml(html);
// this element will have unencoded entities in an attribute, messing up the xml
var selectSingleNode = htmlDoc.DocumentNode.SelectSingleNode("//*[@class='rovi-share']");
if (selectSingleNode != null)
selectSingleNode.Remove();
// The first element with id 'similar-albums' is removed as well (reason not stated in this file).
selectSingleNode = htmlDoc.DocumentNode.SelectSingleNode("//*[@id='similar-albums']");
if (selectSingleNode != null)
selectSingleNode.Remove();
// fix for ajax results having more than one root node
// When no top-level node is named "html", all top-level nodes are moved under a new html element.
if (htmlDoc.DocumentNode.ChildNodes.All(n => n.Name != "html"))
{
var el = htmlDoc.CreateElement("html");
var nodes = htmlDoc.DocumentNode.ChildNodes;
// Order matters here: the children are appended to the new element before they are
// removed from the document node, and only then is the new element attached.
el.AppendChildren(nodes);
htmlDoc.DocumentNode.RemoveAllChildren();
htmlDoc.DocumentNode.AppendChild(el);
}
var xmlDocument = new XmlDocument();
using (Stream stream = new MemoryStream())
using (var xmlTextWriter = new XmlTextWriter(stream, Encoding.UTF8))
{
// NOTE(review): the stream is rewound right after Save without an explicit Flush on the
// writer; this presumably relies on HtmlDocument.Save flushing the writer - confirm.
htmlDoc.Save(xmlTextWriter);
stream.Seek(0, SeekOrigin.Begin);
using (var streamReader = new StreamReader(stream))
xmlDocument.Load(streamReader);
}
return xmlDocument;
}
/// <summary>
/// Downloads the HTML at the given URL and returns it parsed into a standard XmlDocument.
/// Uses the HtmlAgilityPack library for "out of the web" html file support.
/// </summary>
/// <param name="url"> The URL. </param>
/// <returns> An XmlDocument from the HTML </returns>
public static XmlDocument GetHtmlXml(Uri url)
{
    // Download first, then hand the text to the string based overload.
    var html = GetHtml(url);
    return GetHtmlXml(html);
}
// TODO: Refactor!
/// <summary>
/// Downloads the cover art to a local file. If the album doesn't have cover art, this will return null.
/// </summary>
/// <remarks>
/// Files are stored as "&lt;AlbumId&gt;.jpg" in an "ImageCache" directory below the current
/// working directory. A file that is already present is not downloaded again.
/// </remarks>
/// <param name="album"> The album whose CoverUrl and AlbumId are used. </param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="album"/> is null.</exception>
/// <exception cref="WebException">Thrown when the download fails.</exception>
/// <returns> The cached cover file, or null when the album has no cover URL or the URL contains "no_cover". </returns>
public static FileInfo DownloadCoverArt(Album album)
{
    if (album == null)
        throw new ArgumentNullException("album");
    if (String.IsNullOrEmpty(album.CoverUrl))
        return null;
    if (album.CoverUrl.Contains("no_cover"))
        return null;
    // Path.Combine instead of hard-coded "\\" so the cache path is built with the platform separator.
    var directoryInfo = new DirectoryInfo(Path.Combine(".", "ImageCache"));
    if (!Directory.Exists(directoryInfo.FullName))
        directoryInfo.Create();
    var fileInfo = new FileInfo(Path.Combine(directoryInfo.FullName, album.AlbumId + ".jpg"));
    if (!fileInfo.Exists)
    {
        SaveFile(new Uri(album.CoverUrl), fileInfo.FullName);
        // FileInfo caches its state on first access; without a refresh the returned object
        // would still report that the freshly downloaded file does not exist.
        fileInfo.Refresh();
    }
    return fileInfo;
}
}
}
@@ -0,0 +1,22 @@
using System;
using System.Xml;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client
{
/// <summary>
/// Base class for clients that build a URL from a format string and a criteria, download it and return it as XML.
/// </summary>
public abstract class XmlClientBase : IXmlClientBase
{
    /// <summary>
    /// Reads the document that corresponds to the given criteria.
    /// </summary>
    /// <param name="criteria"> The id or search text identifying what to read. </param>
    /// <returns> The document that was read. </returns>
    public abstract XmlDocument Read(string criteria);

    /// <summary>
    /// Escapes the criteria, inserts it into the URL format and downloads the resulting page as XML.
    /// </summary>
    /// <param name="urlFormat"> A composite format string whose {0} placeholder receives the escaped criteria. </param>
    /// <param name="criteria"> The raw (unescaped) id or search text. </param>
    /// <returns> The downloaded page; an empty XmlDocument when the criteria is null or empty. </returns>
    protected static XmlDocument ReadFromCriteria(string urlFormat, string criteria)
    {
        if (String.IsNullOrEmpty(criteria))
            return new XmlDocument();
        //throw new ArgumentNullException("criteria");

        // The criteria is a single URL component, so reserved characters such as '/', '&',
        // '?', '#' and '+' must be escaped too; EscapeUriString leaves them untouched and
        // would let a term like "AC/DC" change the structure of the URL.
        var escapedCriteria = Uri.EscapeDataString(criteria);
        var url = String.Format(urlFormat, escapedCriteria);
        return WebXmlClient.GetHtmlXml(new Uri(url));
    }
}
}
@@ -0,0 +1,65 @@
using System.Collections.Generic;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper;
using MusicMetaTagger.Core.Model;
using MusicMetaTagger.Core.Services;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess
{
/// <summary>
/// Inspired by this Perl module: http://backpan.perl.org/authors/id/Y/YO/YOHAMED/WWW-AllMusicGuide.html
/// </summary>
/// <remarks>
/// Implements IMusicGuide by delegating every call to one of the scrapers passed to the constructor.
/// </remarks>
public class MusicGuideScraper : IMusicGuide
{
    // Scrapers for single entities.
    private readonly IScraper<Album> _albumScraper;
    private readonly IScraper<Artist> _artistScraper;
    private readonly IScraper<Song> _songScraper;
    private readonly IScraper<List<ArtistDiscography>> _discographyScraper;

    // Scrapers for search result pages.
    private readonly IScraper<List<AlbumResult>> _albumResultScraper;
    private readonly IScraper<List<ArtistResult>> _artistResultScraper;
    private readonly IScraper<List<SongResult>> _songResultScraper;

    /// <summary>
    /// Creates the guide from the scrapers it delegates to.
    /// </summary>
    public MusicGuideScraper(
        IScraper<Album> albumScraper,
        IScraper<List<AlbumResult>> albumResultScraper,
        IScraper<Artist> artistScraper,
        IScraper<List<ArtistResult>> artistResultScraper,
        IScraper<List<ArtistDiscography>> discographyScraper,
        IScraper<List<SongResult>> songResultScraper,
        IScraper<Song> songScraper)
    {
        _albumScraper = albumScraper;
        _artistScraper = artistScraper;
        _songScraper = songScraper;
        _discographyScraper = discographyScraper;
        _albumResultScraper = albumResultScraper;
        _artistResultScraper = artistResultScraper;
        _songResultScraper = songResultScraper;
    }

    /// <summary>
    /// Scrapes the artist and then attaches the separately scraped discography.
    /// </summary>
    /// <param name="artistId"> The id of the artist. </param>
    /// <returns> The artist including its discography. </returns>
    public Artist GetArtist(string artistId)
    {
        var artist = _artistScraper.Scrape(artistId);
        var discography = _discographyScraper.Scrape(artistId);
        artist.Discography = discography;
        return artist;
    }

    /// <summary> Scrapes the album with the given id. </summary>
    public Album GetAlbum(string albumId)
    {
        var album = _albumScraper.Scrape(albumId);
        return album;
    }

    /// <summary> Scrapes the song with the given id. </summary>
    public Song GetSong(string songId)
    {
        var song = _songScraper.Scrape(songId);
        return song;
    }

    /// <summary> Searches artists by name. </summary>
    public List<ArtistResult> SearchArtist(string artistName)
    {
        var matches = _artistResultScraper.Scrape(artistName);
        return matches;
    }

    /// <summary> Searches albums by name. </summary>
    public List<AlbumResult> SearchAlbum(string albumName)
    {
        var matches = _albumResultScraper.Scrape(albumName);
        return matches;
    }

    /// <summary> Searches songs by name. </summary>
    public List<SongResult> SearchSong(string songName)
    {
        var matches = _songResultScraper.Scrape(songName);
        return matches;
    }
}
}
@@ -0,0 +1,94 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
using MusicMetaTagger.Core.Utility;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses an album page into an <see cref="Album"/>, using the XPath and regex values of the album scraper settings.
/// </summary>
public class AlbumParser : ParserBase<Album>
{
    /// <summary>
    /// Builds an album from the given page.
    /// </summary>
    /// <param name="xml"> The album page as XML. </param>
    /// <returns> The parsed album, including the tracks that have at least one performer id. </returns>
    /// <exception cref="ApplicationException"> Thrown by the base helpers when a mandatory node (album id, title) or regex match is missing. </exception>
    public override Album Parse(XmlDocument xml)
    {
        var album = new Album();
        var settings = AlbumScraper.Default;

        // An album without an artist link is treated as a "Various Artists" release.
        if (XpathMatch(xml, settings.ArtistUrlXpath))
        {
            var artistUrl = XpathMatchString(xml, settings.ArtistUrlXpath);
            album.ArtistId = RegexCapture(artistUrl, settings.ArtistUrlIdRegex);
            album.ArtistName = XpathMatchString(xml, settings.ArtistXpath);
        }
        else
        {
            album.VariousArtists = true;
            album.ArtistName = "Various Artists";
        }

        if (XpathMatch(xml, settings.SoundtrackXpath))
        {
            album.Soundtrack = true;
        }

        var albumId = XpathMatchString(xml, settings.AlbumIdXpath);
        album.AlbumId = StringUtility.CleanId(albumId);
        album.Title = XpathMatchString(xml, settings.TitleXpath);

        if (XpathMatch(xml, settings.ReleaseDateXpath))
            album.ReleaseDate = XpathMatchString(xml, settings.ReleaseDateXpath);

        //rating
        if (XpathMatch(xml, settings.RatingXpath))
        {
            float rating;
            // The page text is machine data, not user input: parse it with the invariant culture
            // so that "4.5" is not read as 45 on machines whose culture uses ',' as decimal separator.
            var ratingParsed = float.TryParse(
                XpathMatchString(xml, settings.RatingXpath),
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                System.Globalization.CultureInfo.InvariantCulture,
                out rating);
            if (ratingParsed)
                album.Rating = Convert.ToInt32(rating*2); // stored doubled, as an integer
        }

        if (XpathMatch(xml, settings.PickXpath))
            album.Pick = true;

        if (XpathMatch(xml, settings.CoverUrlXpath))
        {
            var coverUrl = XpathMatchString(xml, settings.CoverUrlXpath);
            album.CoverUrl = RegexCapture(coverUrl, settings.CoverUrlRegex);
        }

        if (XpathMatch(xml, settings.ReviewXpath))
            album.Review = XpathMatchString(xml, settings.ReviewXpath);

        if (XpathMatch(xml, settings.ReviewerXpath))
        {
            var reviewer = XpathMatchString(xml, settings.ReviewerXpath);
            album.Reviewer = RegexCapture(reviewer, settings.ReviewerRegex).Trim();
        }

        if (XpathMatch(xml, settings.StylesXpath))
            album.Styles = XpathMatchStringList(xml, settings.StylesXpath);
        if (XpathMatch(xml, settings.MoodsXpath))
            album.Moods = XpathMatchStringList(xml, settings.MoodsXpath);
        if (XpathMatch(xml, settings.ThemesXpath))
            album.Themes = XpathMatchStringList(xml, settings.ThemesXpath);

        album.Tracks = new List<Track>();
        if (XpathMatch(xml, settings.TrackXpath))
        {
            var trackParser = new TrackParser();
            // Only tracks with at least one performer id (and a length above TimeSpan.MinValue) are kept.
            var tracks =
                from tXml in XpathMatchXmlList(xml, settings.TrackXpath)
                let t = trackParser.Parse(tXml)
                where t.PerformerIds.Any() && t.TrackLength > TimeSpan.MinValue
                select t;
            foreach (var track in tracks)
            {
                track.AlbumId = album.AlbumId;
                album.Tracks.Add(track);
            }
        }
        return album;
    }
}
}
@@ -0,0 +1,27 @@
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses an album search result page into a list of <see cref="AlbumResult"/> entries.
/// </summary>
public class AlbumResultPageParser : ParserBase<List<AlbumResult>>
{
    /// <summary>
    /// Parses every album row of the page and numbers the results starting at 1.
    /// </summary>
    /// <param name="xml"> The result page as XML. </param>
    /// <returns> The results in page order; an empty list when the page has no album rows. </returns>
    public override List<AlbumResult> Parse(XmlDocument xml)
    {
        var rowXpath = AlbumResultPageScraper.Default.AlbumXpath;
        var results = new List<AlbumResult>();
        if (!XpathMatch(xml, rowXpath))
            return results;

        var rowParser = new AlbumResultParser();
        foreach (var rowXml in XpathMatchXmlList(xml, rowXpath))
            results.Add(rowParser.Parse(rowXml));

        // ResultOrder is the 1-based position on the page.
        for (var index = 0; index < results.Count; index++)
            results[index].ResultOrder = index + 1;
        return results;
    }
}
}
@@ -0,0 +1,51 @@
using System;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses a single row of the album search result page into an <see cref="AlbumResult"/>.
/// </summary>
public class AlbumResultParser : ParserBase<AlbumResult>
{
    /// <summary>
    /// Reads album id, optional year and genre, artist and title from one result row.
    /// </summary>
    /// <param name="xml"> The XML of one result row. </param>
    /// <returns> The parsed result. </returns>
    /// <exception cref="ApplicationException"> Thrown by the base helpers when the album url, its id or the title cannot be found. </exception>
    public override AlbumResult Parse(XmlDocument xml)
    {
        var result = new AlbumResult();
        var settings = AlbumResultScraper.Default;

        var albumHref = XpathMatchString(xml, settings.AlbumUrlXpath);
        result.AlbumId = RegexCapture(albumHref, settings.AlbumUrlIdRegex);

        // The info text may carry a year and/or a genre; both are optional.
        if (XpathMatch(xml, settings.InfoXpath))
        {
            var info = XpathMatchString(xml, settings.InfoXpath);
            if (RegexMatch(info, settings.YearRegex))
            {
                int parsedYear;
                if (Int32.TryParse(RegexCapture(info, settings.YearRegex), out parsedYear))
                    result.Year = parsedYear;
            }
            if (RegexMatch(info, settings.GenreRegex))
                result.Genre = RegexCapture(info, settings.GenreRegex).Trim();
        }

        // Artists with a link provide id and name; otherwise only a plain name may be present.
        if (XpathMatch(xml, settings.ArtistUrlXpath))
        {
            var artistHref = XpathMatchString(xml, settings.ArtistUrlXpath);
            result.ArtistId = RegexCapture(artistHref, settings.ArtistUrIdRegex);
            result.ArtistName = XpathMatchString(xml, settings.ArtistXpath);
        }
        else if (XpathMatch(xml, settings.ArtistNoUrlXpath))
        {
            result.ArtistName = XpathMatchString(xml, settings.ArtistNoUrlXpath);
        }

        result.AlbumTitle = XpathMatchString(xml, settings.TitleXpath);
        return result;
    }
}
}
@@ -0,0 +1,58 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
using MusicMetaTagger.Core.Utility;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses one discography page of an artist into a list of <see cref="ArtistDiscography"/> entries.
/// </summary>
public class ArtistDiscographyPageParser : ParserBase<List<ArtistDiscography>>
{
    /// <summary>
    /// Parses all releases of the page that have a year greater than zero and stamps them with the artist id and name.
    /// </summary>
    /// <param name="xml"> The discography page as XML. </param>
    /// <returns> The releases of the page; an empty list when the page has no discography rows. </returns>
    /// <exception cref="ApplicationException"> Thrown when the release type of the page is not "main", "singles" or "compilations". </exception>
    public override List<ArtistDiscography> Parse(XmlDocument xml)
    {
        var releases = new List<ArtistDiscography>();
        var settings = ArtistDiscographyPageScraper.Default;
        if (!XpathMatch(xml, settings.DiscographyXpath))
            return releases;

        var releaseTypeText = RegexCapture(
            XpathMatchString(xml, settings.ReleaseTypeXpath),
            settings.ReleaseTypeRegex);
        var rowParser = new ArtistDiscographyParser(ToReleaseType(releaseTypeText));

        releases.AddRange(
            XpathMatchXmlList(xml, settings.DiscographyXpath)
                .Select(rowXml => rowParser.Parse(rowXml))
                .Where(release => release.Year > 0));

        var artistId = StringUtility.CleanId(XpathMatchString(xml, settings.ArtistIdXpath));
        var artistName = XpathMatchString(xml, settings.ArtistXpath);
        releases.ForEach(release =>
        {
            release.ArtistId = artistId;
            release.ArtistName = artistName;
        });
        return releases;
    }

    // Maps the release type text captured from the page to the ReleaseType enum.
    private static ReleaseType ToReleaseType(string releaseTypeText)
    {
        if (releaseTypeText == "main")
            return ReleaseType.Album;
        if (releaseTypeText == "singles")
            return ReleaseType.Single;
        if (releaseTypeText == "compilations")
            return ReleaseType.Compilation;
        throw new ApplicationException("Unknown release type");
    }
}
}
@@ -0,0 +1,66 @@
using System;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
using MusicMetaTagger.Core.Utility;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses a single discography row into an <see cref="ArtistDiscography"/>.
/// </summary>
public class ArtistDiscographyParser : ParserBase<ArtistDiscography>
{
    // Release type of the page the rows come from; used unless the title says otherwise.
    private readonly ReleaseType _releaseType;

    /// <summary>
    /// Creates a parser for rows of a page with the given release type.
    /// </summary>
    /// <param name="releaseType"> The default release type assigned to parsed rows. </param>
    public ArtistDiscographyParser(ReleaseType releaseType)
    {
        _releaseType = releaseType;
    }

    /// <summary>
    /// Reads album id, year, rating, pick flag, label, title and release type from one row.
    /// </summary>
    /// <param name="xml"> The XML of one discography row. </param>
    /// <returns> The parsed discography entry. </returns>
    public override ArtistDiscography Parse(XmlDocument xml)
    {
        var settings = ArtistDiscographyScraper.Default;
        var entry = new ArtistDiscography();

        entry.AlbumId = RegexCapture(
            XpathMatchString(xml, settings.AlbumUrlXpath),
            settings.AlbumUrlIdRegex);

        // The year stays at its default when the text is empty or not numeric.
        var yearText = XpathMatchString(xml, settings.YearXpath);
        if (yearText.Length > 0 && StringUtility.IsNumeric(yearText))
            entry.Year = Convert.ToInt32(yearText);

        //rating
        if (XpathMatch(xml, settings.RatingXpath))
        {
            var ratingText = XpathMatchString(xml, settings.RatingXpath);
            if (!string.IsNullOrEmpty(ratingText))
            {
                int rating;
                if (int.TryParse(RegexCapture(ratingText, settings.RatingRegex), out rating))
                    entry.Rating = rating;
            }
        }

        entry.Pick = XpathMatch(xml, settings.PickXpath);

        if (XpathMatch(xml, settings.LabelXpath))
            entry.Label = XpathMatchString(xml, settings.LabelXpath);

        entry.AlbumTitle = XpathMatchString(xml, settings.TitleXpath);
        entry.ReleaseType = _releaseType;

        // EPs will have "[EP]" after the title
        if (RegexMatch(entry.AlbumTitle, settings.ReleaseTypeRegex))
        {
            if (RegexCapture(entry.AlbumTitle, settings.ReleaseTypeRegex) == "EP")
                entry.ReleaseType = ReleaseType.EP;
        }
        else if (entry.AlbumTitle.Contains("Box Set"))
        {
            entry.ReleaseType = ReleaseType.BoxSet;
        }

        // TODO: add "has review"
        return entry;
    }
}
}
@@ -0,0 +1,30 @@
using System.Collections.Generic;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
using MusicMetaTagger.Core.Utility;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses an artist page into an <see cref="Artist"/>.
/// </summary>
public class ArtistParser : ParserBase<Artist>
{
    /// <summary>
    /// Reads id, name, optional genre and styles of the artist.
    /// </summary>
    /// <param name="artistXml"> The artist page as XML. </param>
    /// <returns> The parsed artist; Styles is an empty list when the page lists none. </returns>
    public override Artist Parse(XmlDocument artistXml)
    {
        var settings = ArtistScaper.Default;
        var parsed = new Artist();

        var rawId = XpathMatchString(artistXml, settings.ArtistIdXpath);
        parsed.ArtistId = StringUtility.CleanId(rawId);
        parsed.ArtistName = XpathMatchString(artistXml, settings.ArtistXpath);

        if (XpathMatch(artistXml, settings.GenreXpath))
            parsed.Genre = XpathMatchString(artistXml, settings.GenreXpath);

        if (XpathMatch(artistXml, settings.StylesXpath))
            parsed.Styles = XpathMatchStringList(artistXml, settings.StylesXpath);
        else
            parsed.Styles = new List<string>();

        return parsed;
    }
}
}
@@ -0,0 +1,27 @@
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses an artist search result page into a list of <see cref="ArtistResult"/> entries.
/// </summary>
public class ArtistResultPageParser : ParserBase<List<ArtistResult>>
{
    /// <summary>
    /// Parses every artist row of the page and numbers the results starting at 1.
    /// </summary>
    /// <param name="xml"> The result page as XML. </param>
    /// <returns> The results in page order; an empty list when the page has no artist rows. </returns>
    public override List<ArtistResult> Parse(XmlDocument xml)
    {
        var rowXpath = ArtistResultPageScraper.Default.ArtistXpath;
        var results = new List<ArtistResult>();
        if (!XpathMatch(xml, rowXpath))
            return results;

        var rowParser = new ArtistResultParser();
        foreach (var rowXml in XpathMatchXmlList(xml, rowXpath))
            results.Add(rowParser.Parse(rowXml));

        // ResultOrder is the 1-based position on the page.
        for (var index = 0; index < results.Count; index++)
            results[index].ResultOrder = index + 1;
        return results;
    }
}
}
@@ -0,0 +1,35 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses a single row of the artist search result page into an <see cref="ArtistResult"/>.
/// </summary>
public class ArtistResultParser : ParserBase<ArtistResult>
{
    /// <summary>
    /// Reads artist id and name plus the optional years-active and genre texts from one result row.
    /// </summary>
    /// <param name="xml"> The XML of one result row. </param>
    /// <returns> The parsed result. </returns>
    public override ArtistResult Parse(XmlDocument xml)
    {
        var result = new ArtistResult();
        var settings = ArtistResultScraper.Default;

        var artistHref = XpathMatchString(xml, settings.ArtistUrlXpath);
        result.ArtistId = RegexCapture(artistHref, settings.ArtistUrIdRegex);
        result.ArtistName = XpathMatchString(xml, settings.ArtistXpath);

        if (!XpathMatch(xml, settings.InfoXpath))
            return result;

        // Both parts of the info text are optional.
        var info = XpathMatchString(xml, settings.InfoXpath);
        if (RegexMatch(info, settings.YearsActiveRegex))
            result.YearsActive = RegexCapture(info, settings.YearsActiveRegex).Trim();
        if (RegexMatch(info, settings.GenreRegex))
            result.Genre = RegexCapture(info, settings.GenreRegex).Trim();
        return result;
    }
}
}
@@ -0,0 +1,9 @@
using System.Xml;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Contract for a parser that turns an XML document into an entity.
/// </summary>
/// <typeparam name="TEntity"> The type produced by the parser (covariant). </typeparam>
public interface IParser<out TEntity>
{
/// <summary>
/// Parses the given document.
/// </summary>
/// <param name="xml"> The document to parse. </param>
/// <returns> The entity built from the document. </returns>
TEntity Parse(XmlDocument xml);
}
}
@@ -0,0 +1,89 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Xml;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
public abstract class ParserBase<TEntity> : IParser<TEntity>
{
public abstract TEntity Parse(XmlDocument xml);
/// <summary>
/// Returns the text of the first match of the pattern in the input.
/// </summary>
/// <param name="str"> The input text. </param>
/// <param name="regexStr"> The regular expression pattern. </param>
/// <returns> The complete matched text (not a sub group). </returns>
/// <exception cref="ApplicationException"> Thrown when the pattern does not match. </exception>
protected static string RegexCapture(string str, string regexStr)
{
    var firstMatch = new Regex(regexStr).Match(str);
    if (firstMatch.Success)
        return firstMatch.Value;
    throw new ApplicationException("Couldn't make regex match for "
        + regexStr + " in " + str);
}
/// <summary>
/// Tells whether the pattern matches anywhere in the input.
/// </summary>
/// <param name="str"> The input text. </param>
/// <param name="regexStr"> The regular expression pattern. </param>
/// <returns> True when there is a match, otherwise false. </returns>
protected static bool RegexMatch(string str, string regexStr)
{
    return new Regex(regexStr).Match(str).Success;
}
/// <summary>
/// Tells whether the XPath expression selects at least one node of the document.
/// </summary>
/// <param name="xml"> The document to query. </param>
/// <param name="xPath"> The XPath expression. </param>
/// <returns> True when one or more nodes are selected, otherwise false. </returns>
protected static bool XpathMatch(XmlDocument xml, string xPath)
{
    var selected = xml.SelectNodes(xPath);
    if (selected == null)
        return false;
    return selected.Count != 0;
}
/// <summary>
/// Returns the HTML-decoded, trimmed inner text of the first node selected by the XPath expression.
/// </summary>
/// <param name="xml"> The document to query. </param>
/// <param name="xPath"> The XPath expression. </param>
/// <returns> The decoded and trimmed text of the first selected node. </returns>
/// <exception cref="ApplicationException"> Thrown when no node is selected; the message contains the XPath and the document XML. </exception>
protected static string XpathMatchString(XmlDocument xml, string xPath)
{
    var selected = xml.SelectNodes(xPath);
    var hasMatch = selected != null && selected.Count > 0;
    if (!hasMatch)
    {
        var message = string.Format("Couldn't find matching nodes for : \"{0}\" in {1}", xPath,
            xml.InnerXml);
        throw new ApplicationException(message);
    }
    return WebUtility.HtmlDecode(selected[0].InnerText).Trim();
}
/// <summary>
/// Returns the HTML-decoded, trimmed inner text of every node selected by the XPath expression.
/// </summary>
/// <param name="xml"> The document to query. </param>
/// <param name="xPath"> The XPath expression. </param>
/// <returns> One text per selected node, in document order. </returns>
/// <exception cref="ApplicationException"> Thrown when no node is selected. </exception>
protected static List<string> XpathMatchStringList(XmlDocument xml, string xPath)
{
    var selected = xml.SelectNodes(xPath);
    if (selected == null || selected.Count == 0)
        throw new ApplicationException("Couldn't find matching nodes for : " + xPath);

    var texts = new List<string>();
    foreach (XmlNode node in selected)
    {
        var decoded = WebUtility.HtmlDecode(node.InnerText);
        texts.Add(decoded.Trim());
    }
    return texts;
}
/// <summary>
/// Returns one new XmlDocument per node selected by the XPath expression; each document has a
/// "root" element that contains a deep copy of the selected node.
/// </summary>
/// <param name="xml"> The document to query. </param>
/// <param name="xPath"> The XPath expression. </param>
/// <returns> The documents, in the order of the selected nodes. </returns>
/// <exception cref="ApplicationException"> Thrown when no node is selected. Before throwing, the document is written to "problem.xml" on a best-effort basis for diagnosis. </exception>
protected static IEnumerable<XmlDocument> XpathMatchXmlList(XmlDocument xml, string xPath)
{
    var titleNodes = xml.SelectNodes(xPath);
    if (titleNodes == null || titleNodes.Count == 0)
    {
        // The dump is a debugging aid only: a failure to write it must not hide the
        // ApplicationException that callers are supposed to see.
        try
        {
            xml.Save("problem.xml");
        }
        catch (System.IO.IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
        catch (XmlException)
        {
        }
        throw new ApplicationException("Couldn't find matching nodes for : " + xPath);
    }

    // The source document is not modified below, so the skeleton text is the same for every
    // node; build it once instead of serializing the first child again per node.
    var skeletonXml = xml.FirstChild.InnerXml + "<root></root>";

    var xmlList = new List<XmlDocument>();
    foreach (XmlNode node in titleNodes)
    {
        var xmlDocument = new XmlDocument();
        xmlDocument.LoadXml(skeletonXml);
        var importNode = xmlDocument.ImportNode(node, true);
        if (xmlDocument.DocumentElement == null)
            throw new ApplicationException("xmlDocument.DocumentElement Null");
        xmlDocument.DocumentElement.AppendChild(importNode);
        xmlList.Add(xmlDocument);
    }
    return xmlList;
}
}
}
@@ -0,0 +1,74 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses one "appears on" row of a song page into a <see cref="SongAppearsOnAlbum"/>.
/// </summary>
public class SongAppearsOnAlbumParser : ParserBase<SongAppearsOnAlbum>
{
    /// <summary>
    /// Reads album id, optional year, artist and album title from one row.
    /// </summary>
    /// <param name="xml"> The XML of one album row. </param>
    /// <returns> The parsed entry. </returns>
    public override SongAppearsOnAlbum Parse(XmlDocument xml)
    {
        var appearance = new SongAppearsOnAlbum();
        var settings = SongAppearsOnAlbumScraper.Default;

        var albumHref = XpathMatchString(xml, settings.AlbumUrlXpath);
        appearance.AlbumId = RegexCapture(albumHref, settings.AlbumUrlIdRegex);

        // The year is optional and only stored when it parses as an integer.
        if (XpathMatch(xml, settings.YearXpath))
        {
            var yearText = XpathMatchString(xml, settings.YearXpath);
            if (RegexMatch(yearText, settings.YearRegex))
            {
                int parsedYear;
                if (Int32.TryParse(RegexCapture(yearText, settings.YearRegex), out parsedYear))
                    appearance.Year = parsedYear;
            }
        }

        // Artists with a link provide id and name; otherwise only a plain name may be present.
        if (XpathMatch(xml, settings.ArtistUrlXpath))
        {
            var artistHref = XpathMatchString(xml, settings.ArtistUrlXpath);
            appearance.ArtistId = RegexCapture(artistHref, settings.ArtistUrIdRegex);
            appearance.ArtistName = XpathMatchString(xml, settings.ArtistXpath);
        }
        else if (XpathMatch(xml, settings.ArtistNoUrlXpath))
        {
            appearance.ArtistName = XpathMatchString(xml, settings.ArtistNoUrlXpath);
        }

        appearance.AlbumTitle = XpathMatchString(xml, settings.TitleXpath);
        return appearance;
    }
}
/// <summary>
/// Parses a song page into a <see cref="Song"/>.
/// </summary>
public class SongParser : ParserBase<Song>
{
    /// <summary>
    /// Reads title and id of the song and the albums it appears on, numbered starting at 1.
    /// </summary>
    /// <param name="xml"> The song page as XML. </param>
    /// <returns> The parsed song; AppearsOnAlbum is empty when the page lists no albums. </returns>
    public override Song Parse(XmlDocument xml)
    {
        var settings = SongScraper.Default;
        var parsed = new Song();

        parsed.SongTitle = XpathMatchString(xml, settings.SongTitleXpath);
        parsed.SongId = RegexCapture(
            XpathMatchString(xml, settings.SongUrlIdXpath),
            settings.SongUrlIdRegex);
        parsed.AppearsOnAlbum = new List<SongAppearsOnAlbum>();

        if (!XpathMatch(xml, settings.AlbumXpath))
            return parsed;

        var rowParser = new SongAppearsOnAlbumParser();
        foreach (var rowXml in XpathMatchXmlList(xml, settings.AlbumXpath))
            parsed.AppearsOnAlbum.Add(rowParser.Parse(rowXml));

        // ResultOrder is the 1-based position on the page.
        var position = 0;
        foreach (var appearance in parsed.AppearsOnAlbum)
            appearance.ResultOrder = ++position;
        return parsed;
    }
}
}
@@ -0,0 +1,27 @@
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses a song search result page into a list of <see cref="SongResult"/> entries.
/// </summary>
public class SongResultPageParser : ParserBase<List<SongResult>>
{
    /// <summary>
    /// Parses every song row of the page and numbers the results starting at 1.
    /// </summary>
    /// <param name="xml"> The result page as XML. </param>
    /// <returns> The results in page order; an empty list when the page has no song rows. </returns>
    public override List<SongResult> Parse(XmlDocument xml)
    {
        var rowXpath = SongResultPageScraper.Default.SongXpath;
        var results = new List<SongResult>();
        if (!XpathMatch(xml, rowXpath))
            return results;

        var rowParser = new SongResultParser();
        foreach (var rowXml in XpathMatchXmlList(xml, rowXpath))
            results.Add(rowParser.Parse(rowXml));

        // ResultOrder is the 1-based position on the page.
        for (var index = 0; index < results.Count; index++)
            results[index].ResultOrder = index + 1;
        return results;
    }
}
}
@@ -0,0 +1,31 @@
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses a single row of the song search result page into a <see cref="SongResult"/>.
/// </summary>
public class SongResultParser : ParserBase<SongResult>
{
    /// <summary>
    /// Reads song title, song id and, when present, the artist from one result row.
    /// </summary>
    /// <param name="xml"> The XML of one result row. </param>
    /// <returns> The parsed result. </returns>
    public override SongResult Parse(XmlDocument xml)
    {
        var result = new SongResult();
        var settings = SongResultScraper.Default;

        // The raw title is narrowed to the part matched by the title regex, when it matches.
        var title = XpathMatchString(xml, settings.SongTitleXpath);
        result.SongTitle = title;
        if (RegexMatch(title, settings.SongTitleRegex))
            result.SongTitle = RegexCapture(title, settings.SongTitleRegex);

        var songHref = XpathMatchString(xml, settings.SongUrlIdXpath);
        result.SongId = RegexCapture(songHref, settings.SongUrlIdRegex);

        if (!XpathMatch(xml, settings.ArtistUrlXpath))
            return result;

        var artistHref = XpathMatchString(xml, settings.ArtistUrlXpath);
        result.ArtistId = RegexCapture(artistHref, settings.ArtistUrIdRegex);
        result.ArtistName = XpathMatchString(xml, settings.ArtistXpath);
        return result;
    }
}
}
@@ -0,0 +1,58 @@
using System;
using System.Collections.Generic;
using System.Xml;
using MusicMetaTagger.Client.AllMusicGuide.Properties;
using MusicMetaTagger.Core.Model;
using MusicMetaTagger.Core.Utility;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser
{
/// <summary>
/// Parses one track row of an album page into a <see cref="Track"/>.
/// </summary>
public class TrackParser : ParserBase<Track>
{
    /// <summary>
    /// Reads id, number, pick flag, title, composers, performer ids and length of a track.
    /// </summary>
    /// <param name="xml"> The XML of one track row. </param>
    /// <returns> The parsed track; PerformerIds is empty when no performer id could be extracted. </returns>
    public override Track Parse(XmlDocument xml)
    {
        var settings = TrackScraper.Default;
        var parsed = new Track();

        if (XpathMatch(xml, settings.TrackUrlXpath))
        {
            parsed.TrackId = RegexCapture(
                XpathMatchString(xml, settings.TrackUrlXpath),
                settings.TrackUrlIdRegex);
        }

        var numberText = XpathMatchString(xml, settings.TrackNumberXpath);
        parsed.TrackNumber = Convert.ToInt32(numberText);
        parsed.Pick = XpathMatch(xml, settings.PickXpath);

        if (XpathMatch(xml, settings.TitleXpath))
            parsed.TrackTitle = XpathMatchString(xml, settings.TitleXpath);
        else
            parsed.TrackTitle = string.Empty;

        // Several composers are joined into one comma separated text.
        if (XpathMatch(xml, settings.ComposerXpath))
        {
            var composers = XpathMatchStringList(xml, settings.ComposerXpath);
            parsed.Composer = String.Join(", ", composers.ToArray());
        }

        parsed.PerformerIds = new List<string>();
        // Fall back to the alternate XPath when the primary one selects nothing.
        var performerXpath = XpathMatch(xml, settings.PerformerIdXpath)
            ? settings.PerformerIdXpath
            : settings.PerformerIdAlternateXpath;
        if (XpathMatch(xml, performerXpath))
        {
            foreach (var rawPerformerId in XpathMatchStringList(xml, performerXpath))
            {
                if (!RegexMatch(rawPerformerId, settings.PerformerIdRegex))
                    continue;
                parsed.PerformerIds.Add(RegexCapture(rawPerformerId, settings.PerformerIdRegex));
            }
        }

        parsed.TrackLength = StringUtility.ConvertTimeSpan(XpathMatchString(xml, settings.TimeXpath));
        return parsed;
    }
}
}
@@ -0,0 +1,11 @@
using System.Collections.Generic;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Scraper for album search results: reads with AlbumResultPageClient and parses with
/// AlbumResultPageParser. All behaviour comes from ScraperBase.
/// </summary>
public class AlbumResultPageScraper : ScraperBase<List<AlbumResult>, AlbumResultPageClient, AlbumResultPageParser>
{
}
}
@@ -0,0 +1,10 @@
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Scraper for a single album: reads with AlbumClient and parses with AlbumParser.
/// All behaviour comes from ScraperBase.
/// </summary>
public class AlbumScraper : ScraperBase<Album, AlbumClient, AlbumParser>
{
}
}
@@ -0,0 +1,24 @@
using System.Collections.Generic;
using System.Linq;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Scrapes the complete discography of an artist from the album, compilation and EP/singles pages.
/// </summary>
public class ArtistDiscographyScraper : IScraper<List<ArtistDiscography>>
{
    /// <summary>
    /// Reads and parses the three discography pages one after the other and concatenates the results.
    /// </summary>
    /// <param name="artistId"> The id of the artist. </param>
    /// <returns> Albums first, then compilations, then EPs/singles. </returns>
    public List<ArtistDiscography> Scrape(string artistId)
    {
        var pageParser = new ArtistDiscographyPageParser();
        var pageClients =
            new IXmlClientBase[]
            {
                new ArtistDiscographyAlbumPageClient(),
                new ArtistDiscographyCompilationPageClient(),
                new ArtistDiscographyEpSinglesPageClient()
            };

        var discography = new List<ArtistDiscography>();
        foreach (var pageClient in pageClients)
        {
            // Each page is parsed right after it was read, before the next page is requested.
            var pageXml = pageClient.Read(artistId);
            discography.AddRange(pageParser.Parse(pageXml));
        }
        return discography;
    }
}
}
@@ -0,0 +1,11 @@
using System.Collections.Generic;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Scraper for artist search results: reads with ArtistResultPageClient and parses with
/// ArtistResultPageParser. All behaviour comes from ScraperBase.
/// </summary>
public class ArtistResultPageScraper : ScraperBase<List<ArtistResult>, ArtistResultPageClient, ArtistResultPageParser>
{
}
}
@@ -0,0 +1,19 @@
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Scraper for a single artist: reads with ArtistClient and parses with ArtistParser.
/// </summary>
public class ArtistScraper : ScraperBase<Artist, ArtistClient, ArtistParser>
{
    /// <summary>
    /// Scrapes the artist and additionally fills in the discography.
    /// </summary>
    /// <remarks>
    /// This method hides the base method (it is declared with "new", not as an override).
    /// It therefore only runs for calls made on a variable typed as ArtistScraper; calls
    /// through IScraper or ScraperBase run the base implementation, which does not load the discography.
    /// </remarks>
    /// <param name="artistId"> The id of the artist. </param>
    /// <returns> The artist including its discography. </returns>
    public new Artist Scrape(string artistId)
    {
        var scrapedArtist = base.Scrape(artistId);
        scrapedArtist.Discography = new ArtistDiscographyScraper().Scrape(artistId);
        return scrapedArtist;
    }
}
}
@@ -0,0 +1,7 @@
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Contract for a scraper that produces an entity for a textual criteria.
/// </summary>
/// <typeparam name="TEntity"> The type produced by the scraper (covariant). </typeparam>
public interface IScraper<out TEntity>
{
/// <summary>
/// Scrapes the entity that corresponds to the given criteria.
/// </summary>
/// <param name="criteria"> The id or search text identifying what to scrape. </param>
/// <returns> The scraped entity. </returns>
TEntity Scrape(string criteria);
}
}
@@ -0,0 +1,16 @@
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Generic scraper that combines a client (reads XML for a criteria) with a parser (turns that XML into an entity).
/// </summary>
/// <typeparam name="TEntity"> The type produced by the scraper. </typeparam>
/// <typeparam name="TClient"> The client type; a new instance is created per call. </typeparam>
/// <typeparam name="TParser"> The parser type; a new instance is created per call. </typeparam>
public abstract class ScraperBase<TEntity, TClient, TParser> : IScraper<TEntity>
    where TClient : IXmlClientBase, new()
    where TParser : IParser<TEntity>, new()
{
    /// <summary>
    /// Reads the document for the criteria and parses it.
    /// </summary>
    /// <param name="criteria"> The id or search text identifying what to scrape. </param>
    /// <returns> The entity parsed from the document that was read. </returns>
    public TEntity Scrape(string criteria)
    {
        var client = new TClient();
        var document = client.Read(criteria);
        var parser = new TParser();
        return parser.Parse(document);
    }
}
}
@@ -0,0 +1,11 @@
using System.Collections.Generic;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Scraper for song search results: reads with SongResultPageClient and parses with
/// SongResultPageParser. All behaviour comes from ScraperBase.
/// </summary>
public class SongResultPageScraper : ScraperBase<List<SongResult>, SongResultPageClient, SongResultPageParser>
{
}
}
@@ -0,0 +1,10 @@
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Client;
using MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Parser;
using MusicMetaTagger.Core.Model;
namespace MusicMetaTagger.Client.AllMusicGuide.RemoteDataAccess.Scraper
{
/// <summary>
/// Scraper for a single song: reads with SongClient and parses with SongParser.
/// All behaviour comes from ScraperBase.
/// </summary>
public class SongScraper : ScraperBase<Song, SongClient, SongParser>
{
}
}