深度优先

这个家伙好懒,除了文章什么都没留下

0%

【.Net】一个简单的爬虫

写了一个小爬虫,把CSDN上发表的博客全都备份了下。

获取发表过的文章信息,存入到数据库。

C# 中用 AngleSharp 这个组件,就可以像用 LINQ 一样进行 HTML 标签的查询操作。

所以从html里获取需要的内容是非常方便的

具体代码,随便写的:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
using AngleSharp.Parser.Html;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

namespace Crawler
{
/// <summary>
/// Console crawler that backs up CSDN blog posts: <c>fuac</c> scrapes an article
/// list page into <c>CSDN_Article</c> rows and <c>ff</c> scrapes a single article
/// page into <c>CSDN_Details</c> rows.
/// </summary>
class Program
{
    // One client for the whole process. Creating and disposing an HttpClient per
    // request leaves sockets in TIME_WAIT and can exhaust them when crawling many pages.
    static readonly HttpClient Http = new HttpClient();

    // Used wherever scraped text is converted to numbers or dates, so the result
    // does not depend on the culture of the machine running the crawler.
    static readonly System.Globalization.CultureInfo Invariant =
        System.Globalization.CultureInfo.InvariantCulture;

    // Running total of rows queued for insertion by ff and fuac.
    static int count = 0;

    /// <summary>
    /// Entry point. The two disabled sections below are the actual backup steps;
    /// only the scratch call to <c>f</c> is currently active.
    /// </summary>
    static void Main(string[] args)
    {
        Console.WriteLine("Hello World!");

        // Step 1 (disabled): crawl list pages 1..10 and store one row per article.
        //for (int i = 1; i <= 10; i++)
        //{
        // fuac("https://blog.csdn.net/qq_32688731/article/list/"+i);
        // Console.WriteLine(count);
        //}

        // Step 2 (disabled): fetch every stored article link and store its content.
        //bds284289328_dbEntities1 db = new bds284289328_dbEntities1();
        //db.CSDN_Article.ToList().ForEach(r =>
        //{
        // ff(r.Link, r.Id);
        // System.Threading.Thread.Sleep(500);
        //});

        f("https://user.qzone.qq.com/1439084907", -1);

        Console.ReadLine();
    }

    /// <summary>
    /// Scratch helper: downloads <paramref name="url"/> and selects its
    /// <c>body</c> elements. Nothing is stored or printed.
    /// </summary>
    /// <param name="url">Page to download.</param>
    /// <param name="id">Not used by this method.</param>
    static void f(string url, int id)
    {
        // NOTE(review): the result is never read; presumably kept for inspection
        // in a debugger - confirm before removing.
        var data = new HtmlParser().Parse(Download(url))
            .QuerySelectorAll("body")
            .ToList();
    }

    /// <summary>
    /// Downloads one article page and queues a <c>CSDN_Details</c> row for every
    /// <c>#main</c> element that contains both the article content and the
    /// article details markup, then saves the rows.
    /// </summary>
    /// <param name="url">Article page to download.</param>
    /// <param name="id">Value stored in <c>ArticleListId</c> of each created row.</param>
    static void ff(string url, int id)
    {
        var document = new HtmlParser().Parse(Download(url));

        var data = new List<details_itme>();
        foreach (var main in document.QuerySelectorAll("#main"))
        {
            var content = main.QuerySelector(".article_content");
            var details = main.QuerySelector("#article_details");
            if (content == null || details == null)
            {
                // Unrecognised layout: report and move on instead of failing
                // the whole run with a NullReferenceException.
                Console.WriteLine("Skipped " + url + ": article markup not found.");
                continue;
            }

            var categoryLink = main.QuerySelector(".subItem_t a");
            var categoryHref = categoryLink == null ? null : categoryLink.GetAttribute("href");

            data.Add(new details_itme
            {
                // "-1" marks an article without a category link.
                ArticleType = categoryHref ?? "-1",
                ArticleContent = Flatten(content.InnerHtml),
                ArticleDetails = Flatten(details.InnerHtml),
            });
        }

        if (data.Count == 0)
        {
            return;
        }

        // Assumes bds284289328_dbEntities1 is a disposable Entity Framework
        // context declared elsewhere in the project - TODO confirm.
        using (var db = new bds284289328_dbEntities1())
        {
            foreach (var item in data)
            {
                var row = new CSDN_Details();
                // e.g. https://blog.csdn.net/qq_32688731/article/category/6568994
                row.ArticleType = ParseCategoryId(item.ArticleType);
                row.ArticleContent = item.ArticleContent;
                row.ArticleDetails = item.ArticleDetails;
                row.ArticleListId = id;

                db.CSDN_Details.Add(row);
                count++;
                Console.WriteLine(count);
            }
            db.SaveChanges();
        }
    }

    /// <summary>
    /// Downloads one article list page and queues a <c>CSDN_Article</c> row for
    /// every <c>.list_item</c> element that has a link and a parseable post date,
    /// then saves the rows.
    /// </summary>
    /// <param name="url">List page to download.</param>
    static void fuac(string url)
    {
        var document = new HtmlParser().Parse(Download(url));

        var data = new List<list_item>();
        foreach (var entry in document.QuerySelectorAll(".list_item"))
        {
            var anchor = entry.QuerySelector(".link_title a");
            var href = anchor == null ? null : anchor.GetAttribute("href");
            if (string.IsNullOrWhiteSpace(href))
            {
                Console.WriteLine("Skipped a list entry on " + url + ": no article link.");
                continue;
            }

            data.Add(new list_item
            {
                // 1 when the "original" icon is present, otherwise 0.
                article_type = entry.QuerySelector(".ico_type_Original") != null ? 1 : 0,
                article_link = href,
                article_title = TextOf(entry, ".link_title"),
                article_description = TextOf(entry, ".article_description"),
                article_postdate = TextOf(entry, ".link_postdate"),
                article_view = TextOf(entry, ".link_view"),
                article_comments = TextOf(entry, ".link_comments"),
            });
        }

        if (data.Count == 0)
        {
            return;
        }

        // Assumes bds284289328_dbEntities1 is a disposable Entity Framework
        // context declared elsewhere in the project - TODO confirm.
        using (var db = new bds284289328_dbEntities1())
        {
            foreach (var item in data)
            {
                DateTime postdate;
                if (!TryParseDate(Flatten(item.article_postdate), out postdate))
                {
                    Console.WriteLine("Skipped " + item.article_link + ": unreadable post date.");
                    continue;
                }

                var row = new CSDN_Article();
                row.Type = item.article_type;
                row.Link = Flatten(item.article_link);
                row.Title = Flatten(item.article_title);
                row.Description = Flatten(item.article_description);
                row.Postdate = postdate;
                // The scraped text looks like "阅读(123)" / "评论(4)"; only the digits matter.
                row.ViewCount = ParseCount(item.article_view);
                row.Comments = ParseCount(item.article_comments);

                db.CSDN_Article.Add(row);

                // Kept from the original. NOTE(review): no request happens inside this
                // loop, so this pause only spaces out the caller's next page fetch.
                System.Threading.Thread.Sleep(100);
                count++;
            }
            db.SaveChanges();
        }
    }

    // Downloads a page as text, blocking the calling thread. GetAwaiter().GetResult()
    // rethrows the original exception, whereas .Result wraps it in an AggregateException.
    static string Download(string url)
    {
        return Http.GetStringAsync(url).GetAwaiter().GetResult();
    }

    // Text content of the first element matching selector inside scope, or "" when none matches.
    static string TextOf(AngleSharp.Dom.IElement scope, string selector)
    {
        var match = scope.QuerySelector(selector);
        return match == null ? "" : (match.TextContent ?? "");
    }

    // Trims the text and removes every line feed; null becomes "".
    static string Flatten(string text)
    {
        return (text ?? "").Trim().Replace("\n", "");
    }

    // Returns the number in the last path segment of a category link, or -1 when
    // the link is missing or does not end in a number.
    static int ParseCategoryId(string href)
    {
        if (string.IsNullOrEmpty(href))
        {
            return -1;
        }

        string tail = href.TrimEnd('/');
        tail = tail.Substring(tail.LastIndexOf('/') + 1);

        int categoryId;
        return int.TryParse(tail, System.Globalization.NumberStyles.Integer, Invariant, out categoryId)
            ? categoryId
            : -1;
    }

    // Reads the ASCII digits out of a counter label; 0 when there are none or the value overflows int.
    static int ParseCount(string text)
    {
        var digits = new string((text ?? "").Where(c => c >= '0' && c <= '9').ToArray());

        int value;
        return int.TryParse(digits, System.Globalization.NumberStyles.None, Invariant, out value)
            ? value
            : 0;
    }

    // Parses a post date culture-independently first, then falls back to the current
    // culture so text the original Convert.ToDateTime call accepted still parses.
    static bool TryParseDate(string text, out DateTime value)
    {
        return DateTime.TryParse(text, Invariant, System.Globalization.DateTimeStyles.None, out value)
            || DateTime.TryParse(text, out value);
    }
}

/// <summary>
/// One entry scraped from an article list page. Apart from
/// <see cref="article_type"/>, every value is the text taken from the page as-is.
/// </summary>
internal class list_item
{
    /// <summary>Article type: 1 = original, 0 = repost.</summary>
    public int article_type { get; set; }

    /// <summary>Link to the article.</summary>
    public string article_link { get; set; }

    /// <summary>Article title.</summary>
    public string article_title { get; set; }

    /// <summary>Article description.</summary>
    public string article_description { get; set; }

    /// <summary>Publication time.</summary>
    public string article_postdate { get; set; }

    /// <summary>Number of views.</summary>
    public string article_view { get; set; }

    /// <summary>Number of comments.</summary>
    public string article_comments { get; set; }
}

/// <summary>
/// Values scraped from a single article page.
/// </summary>
internal class details_itme
{
    /// <summary>Article category.</summary>
    public string ArticleType { get; set; }

    /// <summary>Article content.</summary>
    public string ArticleContent { get; set; }

    /// <summary>Article details.</summary>
    public string ArticleDetails { get; set; }

    /// <summary>Foreign key to the article list.</summary>
    public string ArticleListId { get; set; }
}
}