Last active
August 1, 2021 22:48
-
-
Save michaelrinderle/008d151774a228bd534eab6e3478c67a to your computer and use it in GitHub Desktop.
Html Page Dom Parse
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
namespace HtmlParser | |
{ | |
/// <summary>
/// Console entry point: parses a single hard-coded page with <see cref="DomParser"/>
/// and prints any exception raised along the way.
/// </summary>
class Program
{
    /// <summary>
    /// Builds a parser for the target page and requests its <see cref="PageDom"/>.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        try
        {
            // The parser is disposed when this using block exits, even if parsing throws.
            using (var domParser = new DomParser("https://www.michaelrinderle.com"))
            {
                PageDom parsedPage = domParser.GetPageDom();
            }
        }
        catch (Exception failure)
        {
            // Any failure is written to the console with its full detail.
            Console.WriteLine(failure);
        }
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using HtmlAgilityPack; | |
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Net; | |
using System.Text; | |
namespace HtmlParser | |
{ | |
/// <summary>
/// Loads an HTML document (downloaded from a URL, or supplied as a string) with
/// HtmlAgilityPack and extracts its meta tags, headings, paragraphs, image URLs and
/// links into a <see cref="PageDom"/>.
/// </summary>
public class DomParser : IDisposable
{
    // Substrings that cause an href to be skipped entirely (non-page schemes and
    // document/archive downloads).
    private static readonly string[] UrlBlacklist = { "mailto:", "javascript:", ".pdf", ".zip", ".doc" };

    string Url;
    HtmlDocument Document;
    PageDom Dom;
    private bool _disposed;

    /// <summary>Creates a parser and downloads the page at <paramref name="url"/>.</summary>
    /// <param name="url">Absolute URL of the page to download and parse.</param>
    /// <exception cref="Exception">The page could not be retrieved.</exception>
    public DomParser(string url)
    {
        this.Load(url, string.Empty);
    }

    /// <summary>Creates a parser over markup that has already been fetched.</summary>
    /// <param name="url">Absolute URL the markup came from; used to resolve relative links.</param>
    /// <param name="pageSource">HTML markup. When null or empty the page is downloaded from <paramref name="url"/>.</param>
    /// <exception cref="Exception">The page had to be downloaded and could not be retrieved.</exception>
    public DomParser(string url, string pageSource)
    {
        this.Load(url, pageSource);
    }

    /// <summary>
    /// Replaces the current document with the markup in <paramref name="pageSource"/>, or,
    /// when that is null or empty, with the page downloaded from <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <param name="pageSource">HTML markup, or null/empty to download it.</param>
    /// <exception cref="Exception">
    /// The download failed; the underlying error is available as <see cref="Exception.InnerException"/>.
    /// </exception>
    public void Load(string url, string pageSource)
    {
        this.Url = url;
        this.Document = new();

        if (string.IsNullOrEmpty(pageSource))
        {
            try
            {
                var request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";
                using var response = (HttpWebResponse)request.GetResponse();
                using var stream = response.GetResponseStream();

                // NOTE(review): the body is always decoded as ISO-8859-9, whatever charset
                // the server declares. On .NET Core / .NET 5+ this encoding is presumably
                // only resolvable after registering CodePagesEncodingProvider - confirm.
                this.Document.Load(stream, Encoding.GetEncoding("iso-8859-9"));
            }
            catch (Exception ex)
            {
                // Keep the original failure as the inner exception so callers can diagnose it.
                throw new Exception("Cannot retrieve page source.", ex);
            }
        }
        else
        {
            this.Document.LoadHtml(pageSource);
        }

        // NOTE(review): kept from the original; ParseErrors is presumably never null after
        // a load, in which case this check cannot fire - confirm the intended validation.
        if (this.Document.ParseErrors == null)
        {
            throw new Exception("Invalid Page Source");
        }
    }

    /// <summary>Builds a fresh <see cref="PageDom"/> from the loaded document.</summary>
    /// <returns>The populated page model; meta values that are absent from the page are null.</returns>
    /// <exception cref="ObjectDisposedException">The parser was disposed and no document is loaded.</exception>
    /// <exception cref="UriFormatException">The URL given to the parser is not a valid absolute URI.</exception>
    public PageDom GetPageDom()
    {
        // Dispose() clears the document; fail clearly instead of with a NullReferenceException.
        if (this.Document == null)
        {
            throw new ObjectDisposedException(nameof(DomParser));
        }

        this.Dom = new(this.Url);
        this.Dom.Domain = new Uri(this.Url).Host;

        this.ParseMetaTags();
        this.ParseHeaderTags();
        this.ParsePTags();
        this.ParseImgUrls();
        this.ParseUrls();

        return this.Dom;
    }

    // Reads the title and the standard named meta tags; anything missing is left null.
    private void ParseMetaTags()
    {
        // FirstOrDefault rather than SingleOrDefault: a page may contain more than one
        // title element, which would otherwise throw.
        this.Dom.Title = this.Document.DocumentNode
            .Descendants("title").FirstOrDefault()?.InnerText;

        this.Dom.Language = this.GetMetaContent("language");
        this.Dom.Author = this.GetMetaContent("author");
        this.Dom.Description = this.GetMetaContent("description");
        this.Dom.Keywords = this.GetMetaContent("keywords");
        this.Dom.Robots = this.GetMetaContent("robots");
    }

    // Returns the content attribute of the first meta tag with the given name, or null
    // when the tag or the attribute is absent.
    private string GetMetaContent(string name)
    {
        return this.Document.DocumentNode
            .SelectSingleNode($"//meta[@name='{name}']")
            ?.Attributes["content"]?.Value;
    }

    // Collects the inner text of every h1..h6 element.
    private void ParseHeaderTags()
    {
        this.Dom.H1 = this.GetInnerTexts("h1");
        this.Dom.H2 = this.GetInnerTexts("h2");
        this.Dom.H3 = this.GetInnerTexts("h3");
        this.Dom.H4 = this.GetInnerTexts("h4");
        this.Dom.H5 = this.GetInnerTexts("h5");
        this.Dom.H6 = this.GetInnerTexts("h6");
    }

    // Collects the inner text of every p element.
    private void ParsePTags()
    {
        this.Dom.P = this.GetInnerTexts("p");
    }

    // Returns the inner text of every descendant element with the given tag name.
    private List<string> GetInnerTexts(string tagName)
    {
        return this.Document.DocumentNode
            .Descendants(tagName).Select(x => x.InnerText).ToList();
    }

    // Collects the absolute URL of every img element, skipping embedded (data:) images,
    // images without a src, and sources that cannot be resolved to a URI.
    private void ParseImgUrls()
    {
        // Relative sources are resolved against the page URL. The bare host name in
        // Dom.Domain is not an absolute URI and cannot serve as a base.
        var baseUri = new Uri(this.Url);
        List<string> imageUrls = new();

        foreach (var img in this.Document.DocumentNode.Descendants("img"))
        {
            string src = img.Attributes["src"]?.Value;
            if (string.IsNullOrWhiteSpace(src))
            {
                continue;
            }

            // remove embedded image
            if (src.Contains("data:image"))
            {
                continue;
            }

            // Absolute sources are kept as-is; relative ones are combined with the base.
            if (Uri.TryCreate(baseUri, src, out Uri result))
            {
                imageUrls.Add(result.AbsoluteUri);
            }
        }

        this.Dom.ImageUrls = imageUrls;
    }

    // Collects every usable href and sorts it into QueryUrls (query string only),
    // InternalUrls (same host as the page) or ExternalUrls.
    private void ParseUrls()
    {
        this.Dom.QueryUrls = new();
        this.Dom.InternalUrls = new();
        this.Dom.ExternalUrls = new();

        // SelectNodes yields null, not an empty collection, when nothing matches.
        var anchors = this.Document.DocumentNode.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            return;
        }

        var baseUri = new Uri(this.Url);

        // Walk the links last-to-first so each result list keeps the (reverse document)
        // order the previous implementation produced.
        var hrefs = anchors
            .Select(a => a.GetAttributeValue("href", string.Empty))
            .Reverse();

        foreach (var href in hrefs)
        {
            if (string.IsNullOrEmpty(href))
            {
                continue;
            }

            // blacklist keywords
            if (UrlBlacklist.Any(x => href.Contains(x)))
            {
                continue;
            }

            // Absolute links are kept as-is; relative ones are combined with the page URL.
            if (!Uri.TryCreate(baseUri, href, out Uri result))
            {
                continue;
            }

            string absoluteUrl = result.AbsoluteUri;

            if (absoluteUrl.Contains("?"))
            {
                // Only the query component is recorded for links that carry one.
                this.Dom.QueryUrls.Add(result.Query);
            }
            else if (string.Equals(result.Host, this.Dom.Domain, StringComparison.OrdinalIgnoreCase))
            {
                // Compare hosts, not substrings, so a foreign URL that merely contains
                // the domain text is not classified as internal.
                this.Dom.InternalUrls.Add(absoluteUrl);
            }
            else
            {
                this.Dom.ExternalUrls.Add(absoluteUrl);
            }
        }
    }

    /// <summary>Releases the references to the parsed document and page model.</summary>
    public void Dispose()
    {
        if (!_disposed)
        {
            _disposed = true;
            this.Dom = null;
            this.Document = null;
            GC.SuppressFinalize(this);
        }
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Collections.Generic; | |
namespace HtmlParser | |
{ | |
/// <summary>
/// Plain data holder for what was extracted from one HTML page: its address, meta
/// values, heading/paragraph text and the links found on it.
/// </summary>
public class PageDom
{
    /// <summary>Creates an empty page model for the given address.</summary>
    /// <param name="url">URL of the page this model describes.</param>
    public PageDom(string url)
    {
        this.Url = url;
    }

    // General Dom Info
    public string Url { get; set; }
    public string Domain { get; set; }

    // Page Meta Elements (left null until assigned)
    public string Title { get; set; }
    public string Language { get; set; }
    public string Author { get; set; }
    public string Description { get; set; }
    public string Keywords { get; set; }
    public string Robots { get; set; }

    // Page Element Lists
    // Every list starts out empty rather than null, so a model that has not been
    // (fully) populated can still be enumerated safely.
    public List<string> H1 { get; set; } = new();
    public List<string> H2 { get; set; } = new();
    public List<string> H3 { get; set; } = new();
    public List<string> H4 { get; set; } = new();
    public List<string> H5 { get; set; } = new();
    public List<string> H6 { get; set; } = new();
    public List<string> P { get; set; } = new();

    // Page Urls
    public List<string> ImageUrls { get; set; } = new();
    public List<string> InternalUrls { get; set; } = new();
    public List<string> ExternalUrls { get; set; } = new();
    public List<string> QueryUrls { get; set; } = new();
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment