
李浩 - 云代码空间
——
AngleSharp:https://github.com/AngleSharp/AngleSharp
AngleSharp 是一个 .NET 库,使您能够解析基于尖括号的超文本,例如 HTML、SVG 和 MathML。该库还支持未经验证的 XML。AngleSharp 的一个重要方面是 CSS 也可以解析。
AngleSharp 与类似的库(例如 HtmlAgilityPack)相比的优势在于:
AngleSharp 库专注于标准合规性、交互性和可扩展性。因此,它为使用 C# 的 Web 开发人员提供了从在任何现代浏览器中使用 DOM 所获得的所有可能性。

官方示例:https://github.com/AngleSharp/AngleSharp.Samples
这个简单的示例将使用 Wikipedia 的网站进行数据检索。
// Configure AngleSharp with the default resource loader so that documents
// can be fetched over the network.
var config = Configuration.Default.WithDefaultLoader();

// Page to scrape. The path after /wiki/ is case-sensitive on Wikipedia,
// so the article title must keep its original capitalisation.
var address = "https://en.wikipedia.org/wiki/List_of_The_Big_Bang_Theory_episodes";

// Open the page in a new browsing context.
var context = BrowsingContext.New(config);
var document = await context.OpenAsync(address);

// Select the third cell of every row carrying the "vevent" class.
var cellSelector = "tr.vevent td:nth-child(3)";
var cells = document.QuerySelectorAll(cellSelector);

// Project each selected cell onto its text content.
var titles = cells.Select(m => m.TextContent);
// Create a (reusable) parser front-end.
var parser = new HtmlParser();

// Markup to turn into an HTML DOM. A verbatim string (@"...") is required
// because the literal spans several lines.
var source = @"
<h1>some example source</h1>
<p>this is a paragraph element</p>
";

// Parse the source text into a document.
// ParseDocument is the AngleSharp 0.10+ name of the former Parse method.
var document = parser.ParseDocument(source);

// Create a new <p> element and give it some text.
var p = document.CreateElement("p");
p.TextContent = "this is another paragraph.";

// Append the new element to the document body.
document.Body.AppendChild(p);

// Serialize the complete document back to HTML and hand it to the view.
var html = document.DocumentElement.OuterHtml;
ViewData["html"] = html;
效果展示

给标签添加自定义属性
var parser = new HtmlParser();

// Build an HTML DOM for the markup below. A verbatim string (@"...") is
// required because the literal spans several lines.
var document = parser.ParseDocument(@"
<ul>
<li>first element</li>
<li>second element</li>
<li>third</li>
<li class='bla'>last</li>
</ul>
");

// Select every <li> element and set its "test" attribute to the value "test".
var elements = document.QuerySelectorAll("li").Attr("test", "test");

// `elements` still holds all the <li> elements; the view receives the
// serialized document, which now includes the new attribute.
ViewData["html"] = document.DocumentElement.OuterHtml;
效果展示


// Parse the markup held in `text` (defined outside this snippet) and
// re-serialize it with indentation.
var parser = new HtmlParser();
var document = parser.ParseDocument(text);

using (var writer = new StringWriter())
{
    // PrettyMarkupFormatter emits one node per line, indented with tabs and
    // separated by "\n".
    document.ToHtml(writer, new PrettyMarkupFormatter
    {
        Indentation = "\t",
        NewLine = "\n"
    });

    // The formatted markup; only in scope inside this using block.
    var indentedText = writer.ToString();
}
// HTTP requester that identifies itself with a desktop Chrome user agent.
var requester = new DefaultHttpRequester("mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) chrome/69.0.3497.100 safari/537.36");

// Extra request headers sent with every request made by this requester.
requester.Headers.Add("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
requester.Headers.Add("referer", "");
requester.Headers.Add("accept-language", "zh-hans-cn,zh-hans;q=0.8,en-us;q=0.5,en;q=0.3");

// Browsing context with locale-based encoding, the default loader, cookie
// support, and the custom requester registered as a service.
var context = BrowsingContext.New(
    Configuration.Default
        .WithLocaleBasedEncoding()
        .WithDefaultLoader()
        .WithDefaultCookies()
        .With(requester));

// Load the document for `url` (defined outside this snippet).
// NOTE(review): .Result blocks the calling thread until the request
// completes; prefer `await context.OpenAsync(url)` if the enclosing method
// can be made async.
var document = context.OpenAsync(url).Result;

using (var writer = new StringWriter())
{
    // Serialize the document with tab indentation and "\n" line breaks.
    document.ToHtml(writer, new PrettyMarkupFormatter
    {
        Indentation = "\t",
        NewLine = "\n"
    });

    // The formatted markup; only in scope inside this using block.
    var indentedText = writer.ToString();
}
新建一个belle类用于保存获取的图片信息
/// <summary>
/// Plain data holder with a title and an image address, both settable strings.
/// </summary>
public class belle
{
    /// <summary>
    /// Title.
    /// </summary>
    public string title { get; set; }

    /// <summary>
    /// Image address.
    /// </summary>
    public string imageurl { get; set; }
}
// Configure AngleSharp with the default loader so documents can be fetched.
var config = Configuration.Default.WithDefaultLoader();

// Address of the page to scrape.
var address = "https://www.dbmeinv.com/dbgroup/show.htm?cid=4";

// Request the page.
// NOTE(review): .Result blocks the calling thread until the request
// completes; prefer `await` if the enclosing method can be made async.
var document = BrowsingContext.New(config).OpenAsync(address).Result;

// Select the list items inside elements carrying the "panel-body" class.
var cells = document.QuerySelectorAll(".panel-body li");

// Collect the title and source address of the image inside each list item.
List<belle> list = new List<belle>();
foreach (var item in cells)
{
    // Look the <img> up once; skip list items that do not contain one
    // instead of failing with a NullReferenceException.
    var image = item.QuerySelector("img");
    if (image == null)
    {
        continue;
    }

    list.Add(new belle
    {
        title = image.GetAttribute("title"),
        imageurl = image.GetAttribute("src")
    });
}

ViewData["html"] = list;