
·您现在的位置: 云翼网络 >> 文章中心 >> 网站建设 >> 网站建设开发 >> ASP.NET网站开发 >> 随手正则写的CSDN【只看楼主】功能
写这个小工具的时候没有注意到CSDN其实已经自带【只看楼主】功能,代码写完之后才发现。
现把代码贴出来吧,虽然有很多解析HTML的开源类库如:http://htmlagilitypack.codeplex.com/,但我一直习惯于正则匹配。
截图:

呵呵,起码还能看吧@——#
/// <summary>
/// Click handler: downloads the thread whose address is typed into
/// <c>txtCsdnUrl</c>, collects the thread author's posts from every page
/// and shows the combined text in <c>richTextBox1</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    string url = txtCsdnUrl.Text.Trim();
    if (string.IsNullOrEmpty(url))
    {
        MessageBox.Show("请输入地址");
        return;
    }

    string firstPage = GetHtmlSource(url);
    int pageCount = GetPageCount(firstPage);

    // StringBuilder avoids re-copying the accumulated text for every page.
    StringBuilder context = new StringBuilder();

    // The page fetched above is reused instead of being downloaded again.
    // NOTE(review): assumes the bare URL serves the same content as "?page=1" - confirm.
    context.Append(GetLZArticle(firstPage));

    // Remaining pages, if any, are addressed as <url>?page=<n>.
    for (int page = 2; page <= pageCount; page++)
    {
        context.Append(GetLZArticle(GetHtmlSource(url + "?page=" + page)));
    }

    richTextBox1.Text = context.ToString();
}
33
/// <summary>
/// Downloads the page at the given address and returns its content as text.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>The response body decoded as UTF-8.</returns>
public string GetHtmlSource(string Url)
{
    // WebClient, the response stream and the reader are all disposable;
    // the using blocks release them even when the download or read throws.
    using (WebClient client = new WebClient())
    {
        client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");

        using (Stream data = client.OpenRead(Url))
        using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
}
52
/// <summary>
/// Reads the total number of pages of a thread from the page-jump drop-down
/// in its HTML. Page URLs look like http://bbs.csdn.net/topics/390730011?page=2
/// </summary>
/// <param name="HtmlSource">HTML of a thread page.</param>
/// <returns>
/// The number in the last option of the <c>jumpMenu</c> select, or 0 when the
/// page has no such select, the select has no options, or the last option's
/// text is not a number.
/// </returns>
public int GetPageCount(string HtmlSource)
{
    Regex menuRegex = new Regex("<select class=\"jumpMenu\" name=\"jumpMenu\">(?<val>.*?)</select>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
    Regex optionRegex = new Regex("<option.*?>(?<val>.*?)</option>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Run both patterns once and keep the result instead of re-matching.
    string menuBody = menuRegex.Match(HtmlSource).Groups["val"].Value;
    MatchCollection options = optionRegex.Matches(menuBody);

    // Without this guard, a page with no drop-down would index options[-1] and throw.
    if (options.Count == 0)
    {
        return 0;
    }

    // TryParse leaves pageCount at 0 when the text is not a valid integer.
    int pageCount;
    int.TryParse(options[options.Count - 1].Groups["val"].Value, out pageCount);
    return pageCount;
}
71
/// <summary>
/// Extracts the thread title from a page.
/// </summary>
/// <param name="HtmlSource">HTML of the page.</param>
/// <returns>
/// The content of the first <c>span</c> whose class attribute is
/// "title text_overflow", or an empty string when the page has none.
/// </returns>
public string GetArticleTitle(string HtmlSource)
{
    const string titlePattern = "<span class=\"title text_overflow\">(?<title>.*?)</span>";

    Match found = Regex.Match(HtmlSource, titlePattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // A failed match yields an empty group value, so no explicit check is needed.
    return found.Groups["title"].Value;
}
87
88
/// <summary>
/// Extracts the name of the thread author from a page.
/// </summary>
/// <param name="HtmlSource">HTML of the page.</param>
/// <returns>
/// The content of the first <c>a</c> element written exactly as
/// <c>class="p-author" href="#"</c>, or an empty string when none is found.
/// Matching is case-sensitive and does not span line breaks.
/// </returns>
public string GetAuthorName(string HtmlSource)
{
    const string authorPattern = "<a class=\"p-author\" href=\"#\">(?<value>.*?)</a>";

    Match found = Regex.Match(HtmlSource, authorPattern);

    // A failed match yields an empty group value, so no explicit check is needed.
    return found.Groups["value"].Value;
}
99
/// <summary>
/// Collects the bodies of every post on the page that was written by the
/// thread author, as reported by <c>GetAuthorName</c> for the same page.
/// </summary>
/// <param name="HtmlSource">HTML of one thread page.</param>
/// <returns>
/// The matching post bodies, each followed by a separator line, with
/// <c>&lt;br /&gt;</c> replaced by CR LF and surrounding whitespace trimmed;
/// an empty string when no post matches.
/// </returns>
public string GetLZArticle(string HtmlSource)
{
    string authorName = GetAuthorName(HtmlSource);

    // The user name comes from the page, not from us: escape it so characters
    // such as '.', '+', '(' or '[' are matched literally instead of being
    // interpreted as (possibly invalid) regex syntax.
    Regex regex = new Regex("<td valign=\"top\" class=\"post_info .*?\" data-username=\"" + Regex.Escape(authorName) + "\".*?>.*?<div class=\"post_body\">(?<value>.*?)</div>.*?</td>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

    // Scan the page once; calling Matches inside the loop would restart the
    // search on every iteration.
    StringBuilder result = new StringBuilder();
    foreach (Match post in regex.Matches(HtmlSource))
    {
        result.Append(post.Groups["value"].Value);
        result.Append("--------------------分隔线--------------------");
    }

    return result.ToString().Trim().Replace("<br />", "\r\n");
}
代码都在这里了。