【26】使用WebBrowser和mshtml解析网页
2018-08-09 本文已影响25人
业余玩家
C#采集网页的常用方式有:WebClient、WebBrowser、HttpWebRequest,当然还有其他方式。这次为了解析网页,主要使用了WebBrowser控件,这也是比较简单的一种方式,但感觉速度不太理想,可能是电脑或者网络的原因。另外两种方式这次没有尝试,下次再试。
首先,你需要在工具箱中找到WebBrowser控件,然后把它拖到窗口中合适的位置,就可以使用了。你还可以加入一个网页地址输入栏,这样就可以加载并获取你所输入地址的网页。
2018-08-09_234743.png
webbroser.Navigate("你输入的url")会将指定的文档加载到WebBrowser控件之中(这里的webbroser是控件在代码中的名称)。执行该方法之后,你就能在控件中看到网页了,相当于一个浏览器;你还可以控制前进、后退、刷新网页,从而实现一个简单的浏览器。
/// <summary>
/// Starts a scrape: copies the configured field names into the result column
/// headers, extracts a date from the URL, and navigates the browser control to it.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Button_Click_2(object sender, RoutedEventArgs e)
{
    if (datalist.Items.Count == 0)
    {
        return;
    }

    // Never index past the columns that actually exist in the result view.
    int headerCount = Math.Min(datalist.Items.Count, resultgridview.Columns.Count);
    for (int i = 0; i < headerCount; i++)
    {
        Mydata md = datalist.Items[i] as Mydata;
        if (md != null)
        {
            resultgridview.Columns[i].Header = md.MYKEY;
        }
    }

    if (siteurl.Text == string.Empty)
    {
        return;
    }

    // Forget the date of any previous request so it cannot be stamped onto
    // rows scraped from a URL that carries no date of its own.
    datetime = string.Empty;

    // Site-specific, not general: the 6th '/'-separated segment is expected to
    // look like "yyMMdd.ext". The guard must cover index 5, not just index 3.
    string[] arr = siteurl.Text.Split('/');
    if (arr.Length > 5)
    {
        string stem = arr[5].Split('.')[0];
        if (stem.Length == 6)
        {
            datetime = "20" + stem.Substring(0, 2) + "/" + stem.Substring(2, 2) + "/" + stem.Substring(4, 2);
        }
    }

    // Detach first so repeated clicks never register the handler twice, and
    // attach before navigating so the completion event cannot be missed.
    webbroser.LoadCompleted -= Broserfinished;
    webbroser.LoadCompleted += Broserfinished;

    // Load the requested document into the browser control.
    webbroser.Navigate(siteurl.Text);
}
通过 webbroser.LoadCompleted += 处理方法 来订阅加载完成事件:网页加载完毕之后会执行该处理方法,此时便于抓取网页的所有内容。
List<dataitems> myld = new List<dataitems>();

/// <summary>
/// LoadCompleted handler: walks every div of the loaded document and turns
/// elements whose class matches the first two configured rules into result rows.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Navigation event data (unused).</param>
private void Broserfinished(object sender, NavigationEventArgs e)
{
    // The cast yields null when the control holds no HTML document.
    var document = this.webbroser.Document as HTMLDocument;
    if (document == null)
    {
        return;
    }

    // Elements can be fetched by id or tag name, just like in JavaScript.
    var items = document.getElementsByTagName("div");

    // Only the first two rules take part in matching; read them once up front.
    Mydata firstRule = datalist.Items.Count > 0 ? datalist.Items[0] as Mydata : null;
    Mydata secondRule = datalist.Items.Count > 1 ? datalist.Items[1] as Mydata : null;

    List<dataitems> ld = new List<dataitems>();
    dataitems dt = new dataitems();
    foreach (IHTMLElement item in items)
    {
        // Read each COM property once per element instead of once per rule.
        string text = item.innerText;

        // Drop entries that do not fit: no text, contains "(", or has no digit.
        if (text == null || text.Contains("(") || !isint(text))
        {
            continue;
        }

        string className = item.className;

        // A match on the first rule starts a row...
        if (firstRule != null && className == firstRule.MYVALUE)
        {
            dt.ITEM1 = text;
        }

        // ...and a match on the second rule completes it and begins a new one.
        if (secondRule != null && className == secondRule.MYVALUE)
        {
            dt.ITEM2 = text;
            dt.ITEM3 = datetime;
            ld.Add(dt);
            dt = new dataitems();
        }
    }

    // Rows that never received a first-rule value are discarded.
    foreach (dataitems tdt in ld.Where(row => row.ITEM1 != null))
    {
        resultdatalist.Items.Add(tdt);
    }
}
/// <summary>
/// Reports whether <paramref name="value"/> contains at least one ASCII digit.
/// Despite the name, the whole string does not have to be an integer.
/// </summary>
/// <param name="value">Text to inspect.</param>
/// <returns>true when a character in the range 0-9 occurs anywhere in the text.</returns>
private bool isint(string value)
{
    Match firstDigit = Regex.Match(value, @"[0-9]");
    return firstDigit.Success;
}
MSHTML是微软公司的一个COM组件,该组件封装了HTML语言中的所有元素及其属性,通过其提供的标准接口,可以访问指定网页的所有元素。要使用它首先需要添加其引用,vs2017在程序集-扩展里面可以找到。
2018-08-10_001627.png
完整代码如下,仅供参考,写得还不够通用。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows;
using System.Windows.Controls;
using System.Windows.Data;
using System.Windows.Documents;
using System.Windows.Input;
using System.Windows.Media;
using System.Windows.Media.Imaging;
using System.Windows.Navigation;
using System.Windows.Shapes;
using Microsoft.Office.Interop.Excel;
using mshtml;
namespace GetData
{
/// <summary>
/// MainWindow.xaml 的交互逻辑
/// </summary>
public partial class MainWindow : System.Windows.Window
{
/// <summary>
/// One scrape rule: a display name plus the value matched against element class names.
/// </summary>
public class Mydata
{
    /// <summary>Field name; used as the header of the matching result column.</summary>
    public string MYKEY { get; set; }

    /// <summary>Rule value; compared with an element's className when scraping.</summary>
    public string MYVALUE { get; set; }
}
/// <summary>
/// One scraped result row with up to five string columns.
/// </summary>
public class dataitems
{
    /// <summary>First column of the row.</summary>
    public string ITEM1 { get; set; }

    /// <summary>Second column of the row.</summary>
    public string ITEM2 { get; set; }

    /// <summary>Third column of the row.</summary>
    public string ITEM3 { get; set; }

    /// <summary>Fourth column of the row.</summary>
    public string ITEM4 { get; set; }

    /// <summary>Fifth column of the row.</summary>
    public string ITEM5 { get; set; }
}
/// <summary>
/// Creates the window; the only work done here is the call to InitializeComponent.
/// </summary>
public MainWindow()
{
InitializeComponent();
}
/// <summary>
/// Adds a scrape field built from the name and rule text boxes, then clears both boxes.
/// Shows a message instead when either box is empty.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Button_Click(object sender, RoutedEventArgs e)
{
    string fieldName = selfname.Text;
    string fieldRule = selfvalue.Text;

    bool incomplete = fieldName == string.Empty || fieldRule == string.Empty;
    if (incomplete)
    {
        MessageBox.Show("字段和规则不能为空");
        return;
    }

    datalist.Items.Add(new Mydata { MYKEY = fieldName, MYVALUE = fieldRule });

    // Reset the inputs so the next field can be typed straight away.
    selfname.Text = "";
    selfvalue.Text = "";
}
/// <summary>
/// Removes the currently selected scrape field from the field list, if there is one.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Button_Click_1(object sender, RoutedEventArgs e)
{
    object chosen = datalist.SelectedItem;
    if (chosen == null)
    {
        return;
    }

    datalist.Items.Remove(chosen);
}
// Date text ("20yy/MM/dd") taken from the most recently requested URL;
// Broserfinished copies it into ITEM3 of every scraped row.
string datetime = string.Empty;

/// <summary>
/// Starts a scrape: copies the configured field names into the result column
/// headers, extracts a date from the URL, and navigates the browser control to it.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Button_Click_2(object sender, RoutedEventArgs e)
{
    if (datalist.Items.Count == 0)
    {
        return;
    }

    // Never index past the columns that actually exist in the result view.
    int headerCount = Math.Min(datalist.Items.Count, resultgridview.Columns.Count);
    for (int i = 0; i < headerCount; i++)
    {
        Mydata md = datalist.Items[i] as Mydata;
        if (md != null)
        {
            resultgridview.Columns[i].Header = md.MYKEY;
        }
    }

    if (siteurl.Text == string.Empty)
    {
        return;
    }

    // Forget the date of any previous request so it cannot be stamped onto
    // rows scraped from a URL that carries no date of its own.
    datetime = string.Empty;

    // Site-specific, not general: the 6th '/'-separated segment is expected to
    // look like "yyMMdd.ext". The guard must cover index 5, not just index 3.
    string[] arr = siteurl.Text.Split('/');
    if (arr.Length > 5)
    {
        string stem = arr[5].Split('.')[0];
        if (stem.Length == 6)
        {
            datetime = "20" + stem.Substring(0, 2) + "/" + stem.Substring(2, 2) + "/" + stem.Substring(4, 2);
        }
    }

    // Detach first so repeated clicks never register the handler twice, and
    // attach before navigating so the completion event cannot be missed.
    webbroser.LoadCompleted -= Broserfinished;
    webbroser.LoadCompleted += Broserfinished;

    webbroser.Navigate(siteurl.Text);
}
List<dataitems> myld = new List<dataitems>();

/// <summary>
/// LoadCompleted handler: walks every div of the loaded document and turns
/// elements whose class matches the first two configured rules into result rows.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Navigation event data (unused).</param>
private void Broserfinished(object sender, NavigationEventArgs e)
{
    // The cast yields null when the control holds no HTML document.
    var document = this.webbroser.Document as HTMLDocument;
    if (document == null)
    {
        return;
    }

    var items = document.getElementsByTagName("div");

    // Only the first two rules take part in matching; read them once up front.
    Mydata firstRule = datalist.Items.Count > 0 ? datalist.Items[0] as Mydata : null;
    Mydata secondRule = datalist.Items.Count > 1 ? datalist.Items[1] as Mydata : null;

    List<dataitems> ld = new List<dataitems>();
    dataitems dt = new dataitems();
    foreach (IHTMLElement item in items)
    {
        // Read each COM property once per element instead of once per rule.
        string text = item.innerText;

        // Drop entries that do not fit: no text, contains "(", or has no digit.
        if (text == null || text.Contains("(") || !isint(text))
        {
            continue;
        }

        string className = item.className;

        // A match on the first rule starts a row...
        if (firstRule != null && className == firstRule.MYVALUE)
        {
            dt.ITEM1 = text;
        }

        // ...and a match on the second rule completes it and begins a new one.
        if (secondRule != null && className == secondRule.MYVALUE)
        {
            dt.ITEM2 = text;
            dt.ITEM3 = datetime;
            ld.Add(dt);
            dt = new dataitems();
        }
    }

    // Rows that never received a first-rule value are discarded.
    foreach (dataitems tdt in ld.Where(row => row.ITEM1 != null))
    {
        resultdatalist.Items.Add(tdt);
    }
}
/// <summary>
/// Exports the selected result rows (ITEM1..ITEM3) to a CSV file chosen by the
/// user, using Excel automation.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void export_click(object sender, RoutedEventArgs e)
{
    if (resultdatalist.SelectedItems.Count == 0)
    {
        MessageBox.Show("请选择导出内容");
        return;
    }

    string fileName;
    using (System.Windows.Forms.SaveFileDialog sfd = new System.Windows.Forms.SaveFileDialog())
    {
        sfd.DefaultExt = "csv";
        sfd.Filter = "Excel文件(*.csv)|*.csv";
        sfd.RestoreDirectory = true;
        sfd.CreatePrompt = false;
        sfd.Title = "导出文件到";

        // A cancelled dialog leaves FileName empty; do not try to save in that case.
        if (sfd.ShowDialog() != System.Windows.Forms.DialogResult.OK || string.IsNullOrEmpty(sfd.FileName))
        {
            return;
        }

        fileName = sfd.FileName;
    }

    Microsoft.Office.Interop.Excel.Application app = null;
    Workbook wk = null;
    try
    {
        app = new Microsoft.Office.Interop.Excel.Application();

        // The save dialog already confirmed overwriting; keep the hidden Excel
        // instance from raising a second prompt nobody can see.
        app.DisplayAlerts = false;

        wk = app.Workbooks.Add(System.Type.Missing);

        int row = 1;
        foreach (object selected in resultdatalist.SelectedItems)
        {
            dataitems dt = selected as dataitems;
            if (dt == null)
            {
                continue;
            }

            app.Cells[row, 1] = dt.ITEM1;
            app.Cells[row, 2] = dt.ITEM2;
            app.Cells[row, 3] = dt.ITEM3;
            row++;
        }

        // Ask for CSV explicitly so the content matches the .csv extension
        // rather than whatever Excel's default workbook format is.
        wk.SaveAs(fileName, XlFileFormat.xlCSV);
    }
    finally
    {
        // Always shut Excel down; otherwise each export leaves a background
        // EXCEL.EXE process running.
        if (wk != null)
        {
            wk.Close(false);
            System.Runtime.InteropServices.Marshal.ReleaseComObject(wk);
        }

        if (app != null)
        {
            app.Quit();
            System.Runtime.InteropServices.Marshal.ReleaseComObject(app);
        }
    }
}
/// <summary>
/// Reports whether <paramref name="value"/> contains at least one ASCII digit.
/// Despite the name, the whole string does not have to be an integer.
/// </summary>
/// <param name="value">Text to inspect.</param>
/// <returns>true when a character in the range 0-9 occurs anywhere in the text.</returns>
private bool isint(string value)
{
    Match firstDigit = Regex.Match(value, @"[0-9]");
    return firstDigit.Success;
}
}
}