原文发布时间为:2009-11-15 —— 来源于本人的百度文章 [由搬家工具导入]
.NET抓取数据范例 抓取页面上所有的链接
前台:
<%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head runat="server"> <title>Untitled Page</title></head><body> <form id="form1" runat="server"> <div> <asp:TextBox ID="TextBox1" runat="server" Width="481px">http://hi.baidu.com/handboy</asp:TextBox> <asp:Button ID="Button1" runat="server" OnClick="Button1_Click" Text="提取" /> <br /> <asp:TextBox ID="TextBox2" runat="server" Height="304px" TextMode="MultiLine" Width="524px"></asp:TextBox> </div> </form></body></html>
后台:
using System;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Code-behind for Default.aspx. Downloads the page whose address is typed
/// into TextBox1 and lists every distinct http:// link found in its source
/// in TextBox2, one link per line.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Matches absolute http:// URLs: one or more dot-separated host labels
    // followed by an optional path/query part.
    // NOTE(review): the published listing only contained the tail
    // "- ./?%&=]*)?" of this pattern (which is not a valid regex because of the
    // unmatched ')'); the head was presumably stripped when the article was
    // imported. It is restored here to the common "Internet URL" expression
    // whose tail matches that fragment exactly - confirm against the original.
    private static readonly Regex LinkPattern = new Regex(
        @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
        RegexOptions.IgnoreCase);

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Click handler of the extract button: fetches the page named in
    /// TextBox1 and writes its distinct links to TextBox2.
    /// </summary>
    /// <param name="sender">The control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    /// <exception cref="WebException">
    /// Propagated from the HTTP request when the page cannot be downloaded.
    /// </exception>
    protected void Button1_Click(object sender, EventArgs e)
    {
        TextBox2.Text = "";

        // The address comes straight from the user. Reject anything that is not
        // an absolute http/https URL up front: WebRequest.Create would otherwise
        // throw on malformed input or hand back a non-HTTP request (file://,
        // ftp://) that fails the HttpWebRequest cast.
        Uri pageUri;
        if (!Uri.TryCreate(TextBox1.Text.Trim(), UriKind.Absolute, out pageUri)
            || (pageUri.Scheme != Uri.UriSchemeHttp && pageUri.Scheme != Uri.UriSchemeHttps))
        {
            TextBox2.Text = "请输入有效的 http 或 https 地址";
            return;
        }

        string pageSource = DownloadPage(pageUri);
        TextBox2.Text = ExtractDistinctLinks(pageSource);
    }

    /// <summary>
    /// Downloads the body of <paramref name="pageUri"/> and decodes it as GB2312.
    /// </summary>
    /// <param name="pageUri">Absolute http/https address of the page.</param>
    /// <returns>The decoded response body.</returns>
    private static string DownloadPage(Uri pageUri)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUri);

        // using blocks release the connection, stream and reader even when
        // reading fails part-way through.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Finds every link in <paramref name="pageSource"/> and returns the
    /// distinct ones in order of first appearance, each followed by "\n".
    /// </summary>
    /// <param name="pageSource">Text to scan for links.</param>
    /// <returns>The newline-terminated list of distinct links; empty when none match.</returns>
    private static string ExtractDistinctLinks(string pageSource)
    {
        HashSet<string> seen = new HashSet<string>();
        StringBuilder links = new StringBuilder();

        foreach (Match match in LinkPattern.Matches(pageSource))
        {
            // Filter duplicates: Add returns false when the link was already
            // recorded, so each link is emitted only once.
            if (seen.Add(match.Value))
            {
                links.Append(match.Value).Append("\n");
            }
        }

        return links.ToString();
    }
}