1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
| using AngleSharp.Parser.Html; using System; using System.Collections.Generic; using System.Linq; using System.Net.Http; using System.Text; using System.Threading.Tasks;
namespace Crawler
{
    /// <summary>
    /// Console entry point together with the scraping routines it drives:
    /// <see cref="fuac"/> stores the entries of an article-list page,
    /// <see cref="ff"/> stores the body of a single article page, and
    /// <see cref="f"/> only downloads and parses a page.
    /// </summary>
    class Program
    {
        /// <summary>
        /// One client shared by every request. Creating and disposing an
        /// <see cref="HttpClient"/> per call leaves sockets lingering and can
        /// exhaust them when many pages are fetched in a loop.
        /// </summary>
        private static readonly HttpClient Http = new HttpClient();

        /// <summary>
        /// Prints a greeting, runs the currently enabled crawl step and waits
        /// for a key press so the console window stays open.
        /// </summary>
        /// <param name="args">Command-line arguments (not read).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Hello World!");

            // Disabled crawl stages, kept for reference.
            // Stage 1: store list pages 1..10 through fuac.
            //for (int i = 1; i <= 10; i++)
            //{
            //    fuac("https://blog.csdn.net/qq_32688731/article/list/"+i);
            //    Console.WriteLine(count);
            //}
            // Stage 2: fetch the details of every stored article through ff,
            // pausing 500 ms between requests.
            //bds284289328_dbEntities1 db = new bds284289328_dbEntities1();
            //db.CSDN_Article.ToList().ForEach(r =>
            //{
            //    ff(r.Link, r.Id);
            //    System.Threading.Thread.Sleep(500);
            //});

            f("https://user.qzone.qq.com/1439084907", -1);
            Console.ReadLine();
        }

        /// <summary>Running total of records queued by <see cref="ff"/> and <see cref="fuac"/>.</summary>
        static int count = 0;

        /// <summary>
        /// Downloads <paramref name="url"/> and parses it, collecting the
        /// <c>body</c> elements. Nothing is stored or returned.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="id">Not used by this method.</param>
        static void f(string url, int id)
        {
            var htmlString = Http.GetStringAsync(url).Result;
            HtmlParser htmlParser = new HtmlParser();

            // The result is not consumed; it is only kept in a local.
            var data = htmlParser.Parse(htmlString)
                .QuerySelectorAll("body")
                .ToList();
        }

        /// <summary>
        /// Downloads an article page, extracts category link, content and
        /// details markup from every <c>#main</c> element and saves one
        /// <c>CSDN_Details</c> row per element.
        /// </summary>
        /// <param name="url">Address of the article page.</param>
        /// <param name="id">Value written to <c>ArticleListId</c> of each saved row.</param>
        /// <exception cref="InvalidOperationException">
        /// A <c>#main</c> element lacks <c>.article_content</c> or <c>#article_details</c>.
        /// </exception>
        static void ff(string url, int id)
        {
            var htmlString = Http.GetStringAsync(url).Result;
            HtmlParser htmlParser = new HtmlParser();
            var data = htmlParser.Parse(htmlString)
                .QuerySelectorAll("#main")
                .Select(t => new details_itme()
                {
                    ArticleType = CategoryHref(t),
                    ArticleContent = Clean(RequireElement(t, ".article_content").InnerHtml),
                    ArticleDetails = Clean(RequireElement(t, "#article_details").InnerHtml),
                })
                .ToList();

            // NOTE(review): assumes the generated context is IDisposable — confirm.
            using (bds284289328_dbEntities1 db = new bds284289328_dbEntities1())
            {
                foreach (var item in data)
                {
                    CSDN_Details cSDN_Details = new CSDN_Details();

                    // The category id is the last path segment, e.g.
                    // https://blog.csdn.net/qq_32688731/article/category/6568994
                    // The "-1" fallback has no slash, so it parses to -1 unchanged.
                    string temp = item.ArticleType.Substring(item.ArticleType.LastIndexOf('/') + 1);
                    cSDN_Details.ArticleType = int.Parse(temp);
                    cSDN_Details.ArticleContent = item.ArticleContent;
                    cSDN_Details.ArticleDetails = item.ArticleDetails;
                    cSDN_Details.ArticleListId = id;

                    db.CSDN_Details.Add(cSDN_Details);
                    count++;
                    Console.WriteLine(count);
                }

                db.SaveChanges();
            }
        }

        /// <summary>
        /// Downloads an article-list page, reads every <c>.list_item</c>
        /// element into a <see cref="list_item"/> and saves one
        /// <c>CSDN_Article</c> row per element.
        /// </summary>
        /// <param name="url">Address of the list page.</param>
        /// <exception cref="InvalidOperationException">
        /// A list entry lacks one of the expected child elements.
        /// </exception>
        static void fuac(string url)
        {
            var htmlString = Http.GetStringAsync(url).Result;
            HtmlParser htmlParser = new HtmlParser();
            var data = htmlParser.Parse(htmlString)
                .QuerySelectorAll(".list_item")
                .Select(t => new list_item()
                {
                    // 1 when the "original" icon is present, otherwise 0.
                    article_type = t.QuerySelector(".ico_type_Original") != null ? 1 : 0,
                    article_link = RequireElement(t, ".link_title a").GetAttribute("href"),
                    article_title = RequireElement(t, ".link_title").TextContent,
                    article_description = RequireElement(t, ".article_description").TextContent,
                    article_postdate = RequireElement(t, ".link_postdate").TextContent,
                    article_view = RequireElement(t, ".link_view").TextContent,
                    article_comments = RequireElement(t, ".link_comments").TextContent,
                })
                .ToList();

            // NOTE(review): assumes the generated context is IDisposable — confirm.
            using (bds284289328_dbEntities1 db = new bds284289328_dbEntities1())
            {
                foreach (var item in data)
                {
                    CSDN_Article cSDN_Article = new CSDN_Article();
                    cSDN_Article.Type = item.article_type;
                    cSDN_Article.Link = Clean(item.article_link);
                    cSDN_Article.Title = Clean(item.article_title);
                    cSDN_Article.Description = Clean(item.article_description);
                    cSDN_Article.Postdate = Convert.ToDateTime(Clean(item.article_postdate));

                    // The counters arrive wrapped in a label, e.g. "阅读(12)".
                    cSDN_Article.ViewCount = Convert.ToInt32(Clean(item.article_view).Replace("阅读(", "").Replace(")", ""));
                    cSDN_Article.Comments = Convert.ToInt32(Clean(item.article_comments).Replace("评论(", "").Replace(")", ""));

                    db.CSDN_Article.Add(cSDN_Article);

                    // NOTE(review): this pause sits between Add calls, not
                    // between HTTP requests — confirm it is still wanted.
                    System.Threading.Thread.Sleep(100);
                    count++;
                }

                db.SaveChanges();
            }
        }

        /// <summary>
        /// Returns the first descendant of <paramref name="parent"/> matching
        /// <paramref name="selector"/>, or throws an exception naming the
        /// selector instead of letting a null dereference happen later.
        /// </summary>
        private static AngleSharp.Dom.IElement RequireElement(AngleSharp.Dom.IElement parent, string selector)
        {
            var match = parent.QuerySelector(selector);
            if (match == null)
            {
                throw new InvalidOperationException("No element matches selector '" + selector + "'.");
            }

            return match;
        }

        /// <summary>
        /// Returns the <c>href</c> of the first <c>.subItem_t a</c> anchor, or
        /// "-1" when the anchor or its <c>href</c> attribute is missing.
        /// </summary>
        private static string CategoryHref(AngleSharp.Dom.IElement article)
        {
            var anchor = article.QuerySelector(".subItem_t a");
            var href = anchor == null ? null : anchor.GetAttribute("href");
            return href ?? "-1";
        }

        /// <summary>Trims <paramref name="raw"/> and removes every line-feed character.</summary>
        private static string Clean(string raw)
        {
            return raw.Trim().Replace("\n", "");
        }
    }
/// <summary>
/// Raw values read from one entry of an article-list page. Everything except
/// <see cref="article_type"/> is kept as the text taken from the page.
/// </summary>
internal class list_item
{
    /// <summary>Article type: 1 = original, 0 = repost.</summary>
    public int article_type { get; set; }

    /// <summary>Link to the article.</summary>
    public string article_link { get; set; }

    /// <summary>Article title.</summary>
    public string article_title { get; set; }

    /// <summary>Article description.</summary>
    public string article_description { get; set; }

    /// <summary>Publication time.</summary>
    public string article_postdate { get; set; }

    /// <summary>Number of views.</summary>
    public string article_view { get; set; }

    /// <summary>Number of comments.</summary>
    public string article_comments { get; set; }
}
/// <summary>
/// Raw values read from one article page, all kept as strings.
/// </summary>
internal class details_itme
{
    /// <summary>Article category.</summary>
    public string ArticleType { get; set; }

    /// <summary>Article content.</summary>
    public string ArticleContent { get; set; }

    /// <summary>Article details.</summary>
    public string ArticleDetails { get; set; }

    /// <summary>Foreign key to the article list.</summary>
    public string ArticleListId { get; set; }
}
} // namespace Crawler
|