The HTML page that needs to be parsed is structured without tables or divs. I need to pull each hyperlink from this HTML page along with the file-modified text next to it (e.g., h2_eh.xml23-Feb-2011 05:05). What would be the best
approach? I only want to pull the hyperlinks whose filenames have the .xml extension.
This code works and doesn't blow up with the test case; YMMV when it is used in the field. It could certainly do with some more checks, such as whether
the regex matched anything, whether the next sibling is the right node type, etc.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using HtmlAgilityPack;
using System.IO;
using System.Text.RegularExpressions;
/// <summary>
/// One extracted row: the target of a hyperlink and the date text found next to it.
/// </summary>
public class FileAndDate
{
    /// <summary>
    /// The link target, stored as a plain string.
    /// </summary>
    public string File { get; set; }

    /// <summary>
    /// The date text associated with the link, stored as a plain string.
    /// </summary>
    public string Date { get; set; }
}
namespace ExtractAllHrefFromHtmlSnippet
{
    /// <summary>
    /// Web Forms page that loads a sample HTML file, extracts every link whose
    /// href ends in ".xml" together with the date text that follows the link,
    /// and binds the result to <c>GridViewHrefs</c>.
    /// </summary>
    public partial class ParseExampleHtml : System.Web.UI.Page
    {
        /// <summary>
        /// Matches a date of the form dd-Mon-yyyy (for example 23-Feb-2011).
        /// Built once and shared instead of being re-created for every link.
        /// Note: only the date is matched; a trailing time such as "05:05" is not captured.
        /// </summary>
        private static readonly Regex DatePattern = new Regex(@"[\d]{2}-[A-Z][a-z]{2}-[\d]{4}");

        /// <summary>
        /// Loads the sample HTML, extracts the .xml links and binds them to the grid.
        /// </summary>
        /// <param name="sender">The event source (unused).</param>
        /// <param name="e">The event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // load snippet
            HtmlDocument htmlSnippet = LoadHtmlSnippetFromFile();

            // extract hrefs
            List<FileAndDate> hrefTags = ExtractAllAHrefTags(htmlSnippet);

            // bind to gridview
            // NOTE(review): GridViewHrefs is not declared in this file; presumably it
            // lives in the designer half of this partial class.
            GridViewHrefs.DataSource = hrefTags;
            GridViewHrefs.DataBind();
        }

        /// <summary>
        /// Loads the HTML snippet from ~/App_Data/Sample.html.
        /// </summary>
        /// <returns>The parsed document.</returns>
        private HtmlDocument LoadHtmlSnippetFromFile()
        {
            HtmlDocument doc = new HtmlDocument();

            // using guarantees the reader is closed even if Load throws.
            using (TextReader reader = File.OpenText(Server.MapPath("~/App_Data/Sample.html")))
            {
                doc.Load(reader);
            }

            return doc;
        }

        /// <summary>
        /// Extracts all anchor tags that point at .xml files using HtmlAgilityPack.
        /// </summary>
        /// <param name="htmlSnippet">The parsed HTML document to scan.</param>
        /// <returns>
        /// One entry per matching anchor. <c>Date</c> is an empty string when the
        /// anchor has no following sibling or the sibling text contains no date.
        /// An empty list is returned when the document has no anchors with an href.
        /// </returns>
        private List<FileAndDate> ExtractAllAHrefTags(HtmlDocument htmlSnippet)
        {
            List<FileAndDate> hrefTags = new List<FileAndDate>();

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection links = htmlSnippet.DocumentNode.SelectNodes("//a[@href]");
            if (links == null)
            {
                return hrefTags;
            }

            foreach (HtmlNode link in links)
            {
                string href = link.Attributes["href"].Value;

                // Ordinal, case-insensitive comparison: file extensions are not linguistic text.
                if (!href.EndsWith(".xml", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // The date is expected in the node right after the anchor; the last
                // anchor in a parent may have no sibling at all.
                HtmlNode sibling = link.NextSibling;
                string trailingText = sibling != null ? sibling.InnerText : string.Empty;

                // Match.Value is an empty string when the pattern does not match.
                Match match = DatePattern.Match(trailingText);
                hrefTags.Add(new FileAndDate() { File = href, Date = match.Value });
            }

            return hrefTags;
        }
    }
}