深度优先

这个家伙好懒,除了文章什么都没留下

0%

转自:http://www.cnblogs.com/soundcode/p/6247531.html

汉字转拼音貌似一直是C#开发的一个难题,无论什么方案都有一定的bug,之前使用了两种方案。

1.Chinese2Spell.cs 一些不能识别的汉字全部转为Z

2.Microsoft Visual Studio International Feature Pack 1.0 连“广”、“区”都不能转,很让人失望。

这些都是2010年以前的方案,至少还有大侠在为汉字转拼音不断努力着,目前发现最完美的就是NPINYIN,在googlecode可以看到它的开源项目,http://code.google.com/p/npinyin/

不能识别的字很少,而且还在不断维护更新,日趋完美,推荐大家使用。

下载地址

dll:http://files.cnblogs.com/files/guohu/NPinyin-0.2.4588.20158-bin.zip

源码:http://files.cnblogs.com/files/guohu/NPinyin-0.2.x-source_code.zip

v0.2.x的变化

  • 1、增加对不同编码格式文本的支持,同时增加编码转换方法Pinyin.ConvertEncoding
  • 2、重构单字符拼音的获取,未找到拼音时返回字符本身. 汪思言 2012年7月23日晚

将中文转换成拼音全文和首字母的.net 组件。示例:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
using System;
using System.Collections.Generic;
using System.Text;
using NPinyin;

namespace NPinyinTest
{
  class Program
  {
    static void Main(string[] args)
    {
      string[] maxims = new string[]{
        "事常与人违,事总在人为",
        "骏马是跑出来的,强兵是打出来的",
        "驾驭命运的舵是奋斗。不抱有一丝幻想,不放弃一点机会,不停止一日努力。 ",
        "如果惧怕前面跌宕的山岩,生命就永远只能是死水一潭",
        "懦弱的人只会裹足不前,莽撞的人只能引为烧身,只有真正勇敢的人才能所向披靡"
      };

      string[] medicines = new string[] {
        "聚维酮碘溶液",
        "开塞露",
        "炉甘石洗剂",
        "苯扎氯铵贴",
        "鱼石脂软膏",
        "莫匹罗星软膏",
        "红霉素软膏",
        "氢化可的松软膏",
        "曲安奈德软膏",
        "丁苯羟酸乳膏",
        "双氯芬酸二乙胺乳膏",
        "冻疮膏",
        "克霉唑软膏",
        "特比奈芬软膏",
        "酞丁安软膏",
        "咪康唑软膏、栓剂",
        "甲硝唑栓",
        "复方莪术油栓"
      };

      Console.WriteLine("UTF8句子拼音:");
      foreach (string s in maxims)
      {
        Console.WriteLine("汉字:{0}\n拼音:{1}\n", s, Pinyin.GetPinyin(s));
      }

      Encoding gb2312 = Encoding.GetEncoding("GB2312");
      Console.WriteLine("GB2312拼音简码:");
      foreach (string m in medicines)
      {
        string s = Pinyin.ConvertEncoding(m, Encoding.UTF8, gb2312);
        Console.WriteLine("药品:{0}\n简码:{1}\n", s, Pinyin.GetInitials(s, gb2312));
      }

      Console.ReadKey();
    }
  }
}

using System;
using System.Collections.Generic;
using System.Text;
using NPinyin;

namespace NPinyinTest
{
  class Program
  {
    static void Main(string[] args)
    {
      string[] maxims = new string[]{
        "事常与人违,事总在人为",
        "骏马是跑出来的,强兵是打出来的",
        "驾驭命运的舵是奋斗。不抱有一丝幻想,不放弃一点机会,不停止一日努力。 ",
        "如果惧怕前面跌宕的山岩,生命就永远只能是死水一潭",
        "懦弱的人只会裹足不前,莽撞的人只能引为烧身,只有真正勇敢的人才能所向披靡"
      };

      string[] medicines = new string[] {
        "聚维酮碘溶液",
        "开塞露",
        "炉甘石洗剂",
        "苯扎氯铵贴",
        "鱼石脂软膏",
        "莫匹罗星软膏",
        "红霉素软膏",
        "氢化可的松软膏",
        "曲安奈德软膏",
        "丁苯羟酸乳膏",
        "双氯芬酸二乙胺乳膏",
        "冻疮膏",
        "克霉唑软膏",
        "特比奈芬软膏",
        "酞丁安软膏",
        "咪康唑软膏、栓剂",
        "甲硝唑栓",
        "复方莪术油栓"
      };

      Console.WriteLine("UTF8句子拼音:");
      foreach (string s in maxims)
      {
        Console.WriteLine("汉字:{0}\n拼音:{1}\n", s, Pinyin.GetPinyin(s));
      }

      Encoding gb2312 = Encoding.GetEncoding("GB2312");
      Console.WriteLine("GB2312拼音简码:");
      foreach (string m in medicines)
      {
        string s = Pinyin.ConvertEncoding(m, Encoding.UTF8, gb2312);
        Console.WriteLine("药品:{0}\n简码:{1}\n", s, Pinyin.GetInitials(s, gb2312));
      }

      Console.ReadKey();
    }
  }
}

闲来无聊爬了下全国的省市区乡镇居委会的信息,存入到数据库。

以后做地址联动选择的时候可能用得着,这次可以精确到居委会

数据来源:国家统计局 2016年统计用区划代码和城乡划分代码(截止2016年07月31日)

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html

具体代码,也是写的比较随意:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
using AngleSharp.Parser.Html;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

namespace CrawlerArea
{
class Program
{
/// <summary>
/// Entry point of the area crawler. The crawl was run one administrative
/// level at a time; the earlier levels are kept below as commented-out code
/// and only the committee level is currently active.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.WriteLine(DateTime.Now);

    // Province level
    //f("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html");

    // City level
    //AreaDBEntities areaDBEntities = new AreaDBEntities();
    //var data = areaDBEntities.AreaInfoes.ToList();
    //foreach (var item in data)
    //{
    //    string url = string.Format("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/{0}.html", item.Code);
    //    getCityInfo(url, item.Code);
    //    System.Threading.Thread.Sleep(50);
    //}
    //Console.WriteLine(DateTime.Now);

    // District / county level
    //AreaDBEntities areaDBEntities = new AreaDBEntities();
    //var data = areaDBEntities.AreaInfoes.ToList();
    //foreach (var item in data)
    //{
    //    string url = string.Format("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/{0}/{1}.html", item.PedarId, item.Code);
    //    getCountyInfo(url, item.PedarId, item.Code);
    //    System.Threading.Thread.Sleep(50);
    //}
    //Console.WriteLine(DateTime.Now);

    // Street level
    //AreaDBEntities areaDBEntities = new AreaDBEntities();
    //var data = areaDBEntities.AreaInfoes.Where(t => t.PedarId >= 1000).ToList();
    //foreach (var item in data)
    //{
    //    string temp = item.Code.ToString();
    //    string url = string.Format("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/{0}/{1}/{2}.html", temp.Substring(0, 2), temp.Substring(2, 2), item.Code);
    //    getStreetInfo(url, item.Code);
    //    System.Threading.Thread.Sleep(50);
    //}
    //Console.WriteLine(DateTime.Now);

    // Committee level: every stored row whose code has at least nine
    // characters is used as the parent of one committee page.
    AreaDBEntities areaDBEntities = new AreaDBEntities();
    var parents = areaDBEntities.AreaInfoes.Where(row => row.Code.Length >= 9).ToList();
    for (int index = 0; index < parents.Count; index++)
    {
        var parent = parents[index];
        string codeText = parent.Code.ToString();

        // The page path is made of the first three two-character groups of
        // the code followed by the full code.
        string first = codeText.Substring(0, 2);
        string second = codeText.Substring(2, 2);
        string third = codeText.Substring(4, 2);
        string url = string.Format(
            "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/{0}/{1}/{2}/{3}.html",
            first, second, third, parent.Code);

        getCommitteeInfo(url, parent.Code);
        Console.WriteLine(string.Concat(parent.Code, "----", parent.Name));

        // Pause between requests.
        System.Threading.Thread.Sleep(200);
    }

    Console.WriteLine(DateTime.Now);
    Console.WriteLine("OK");
    Console.ReadKey();
}
/// <summary>
/// Downloads one committee listing page and stores every <c>.villagetr</c>
/// table row found on it as a child of the given parent code.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="code">
/// Code of the parent row; parsed as an integer and stored as
/// <c>PedarId</c> on each inserted row.
/// </param>
/// <remarks>
/// A page that cannot be downloaded is reported on standard error and
/// skipped, so one bad page does not abort the whole crawl.
/// </remarks>
private static void getCommitteeInfo(string url, string code)
{
    string html = HttpGet(url);
    if (html == null)
    {
        // HttpGet returns null when the request fails; the parser cannot
        // take a null document, so stop here instead of crashing.
        Console.Error.WriteLine("getCommitteeInfo: download failed, skipped " + url);
        return;
    }

    HtmlParser htmlParser = new HtmlParser();
    var rows = htmlParser.Parse(html).QuerySelectorAll(".villagetr");
    if (rows.Length == 0)
    {
        return;
    }

    // Parsed once; every row on the page shares the same parent.
    int parentId = int.Parse(code);

    // NOTE(review): assumes AreaDBEntities is a disposable Entity Framework
    // context - confirm. This method runs once per page, so an undisposed
    // context per call accumulates over a long crawl.
    using (AreaDBEntities areaDBEntities = new AreaDBEntities())
    {
        foreach (var row in rows)
        {
            var cells = row.Children.ToList();
            if (cells.Count < 3)
            {
                // The code is read from the first cell and the name from the
                // third; a shorter row cannot supply both.
                continue;
            }

            AreaInfo areaInfo = new AreaInfo();
            areaInfo.Code = cells[0].InnerHtml;
            areaInfo.Name = cells[2].InnerHtml;
            areaInfo.PedarId = parentId;
            areaDBEntities.AreaInfoes.Add(areaInfo);
        }

        areaDBEntities.SaveChanges();
    }
}

/// <summary>
/// 街道
/// </summary>
/// <param name="url"></param>
/// <param name="code"></param>
//private static void getStreetInfo(string url, int? code)
//{
// var htmlString = HttpGet(url);
// HtmlParser htmlParser = new HtmlParser();
// var data = htmlParser.Parse(htmlString)
// .QuerySelectorAll(".towntr")
// .Select(t => t)
// .ToList();

// List<Node> list = new List<Node>();
// foreach (var item in data)
// {
// var area = htmlParser.Parse(item.InnerHtml)
// .QuerySelectorAll("a")
// .Select(t => t).ToList();

// foreach (var td in area)
// {
// Node node = new Node();
// node.code = td.GetAttribute("href");
// node.area = td.TextContent;
// list.Add(node);
// }
// }

// AreaDBEntities areaDBEntities = new AreaDBEntities();
// int k = 0;
// foreach (var item in list)
// {
// if (k % 2 != 0)
// {
// AreaInfo areaInfo = new AreaInfo();
// Console.WriteLine(item.code + "----" + item.area);
// string code1 = item.code.Substring(item.code.IndexOf("/") + 1, 9);
// areaInfo.Code = int.Parse(code1);
// areaInfo.Name = item.area;
// areaInfo.PedarId = code;
// areaDBEntities.AreaInfoes.Add(areaInfo);
// }
// k++;
// }
// areaDBEntities.SaveChanges();
// Console.WriteLine();
//}

//private static void getCountyInfo(string url, int? PedarId, int? code)
//{
// if (PedarId == null) return;
// var htmlString = HttpGet(url);
// HtmlParser htmlParser = new HtmlParser();
// var data = htmlParser.Parse(htmlString)
// .QuerySelectorAll(".countytr")
// .Select(t => t)
// .ToList();

// List<Node> list = new List<Node>();
// foreach (var item in data)
// {
// var area = htmlParser.Parse(item.InnerHtml)
// .QuerySelectorAll("a")
// .Select(t => t).ToList();

// foreach (var td in area)
// {
// Node node = new Node();
// node.code = td.GetAttribute("href");
// node.area = td.TextContent;
// list.Add(node);
// }
// }

// AreaDBEntities areaDBEntities = new AreaDBEntities();
// int k = 0;
// foreach (var item in list)
// {
// if (k % 2 != 0)
// {
// AreaInfo areaInfo = new AreaInfo();
// Console.WriteLine(item.code + "----" + item.area);
// string code1 = item.code.Substring(item.code.IndexOf("/") + 1, 6);
// areaInfo.Code = int.Parse(code1);
// areaInfo.Name = item.area;
// areaInfo.PedarId = code;
// areaDBEntities.AreaInfoes.Add(areaInfo);
// }
// k++;
// }
// areaDBEntities.SaveChanges();
// Console.WriteLine();
//}

//private static void getCityInfo(string url, int? PedarId)
//{
// var htmlString = HttpGet(url);
// HtmlParser htmlParser = new HtmlParser();
// var data = htmlParser.Parse(htmlString)
// .QuerySelectorAll(".citytr")
// .Select(t => t)
// .ToList();

// List<Node> list = new List<Node>();
// foreach (var item in data)
// {
// var area = htmlParser.Parse(item.InnerHtml)
// .QuerySelectorAll("a")
// .Select(t => t).ToList();

// foreach (var td in area)
// {
// Node node = new Node();
// node.code = td.GetAttribute("href");
// node.area = td.TextContent;
// list.Add(node);
// }
// }

// AreaDBEntities areaDBEntities = new AreaDBEntities();
// int k = 0;
// foreach (var item in list)
// {
// if (k % 2 != 0)
// {
// AreaInfo areaInfo = new AreaInfo();
// Console.WriteLine(item.code + "----" + item.area);
// string code = item.code.Substring(item.code.IndexOf("/") + 1, 4);
// areaInfo.Code = int.Parse(code);
// areaInfo.Name = item.area;
// areaInfo.PedarId = PedarId;
// areaDBEntities.AreaInfoes.Add(areaInfo);
// }
// k++;
// }
// areaDBEntities.SaveChanges();
// Console.WriteLine();
//}

/// <summary>
/// Performs a blocking HTTP GET and returns the response body as text.
/// </summary>
/// <param name="url">Absolute address to request.</param>
/// <returns>
/// The response body, or <c>null</c> when the request could not be created
/// or completed. The failure reason is written to standard error.
/// </returns>
/// <remarks>
/// The body is decoded with <see cref="Encoding.Default"/>, so the result
/// depends on the code page of the machine running the crawler.
/// </remarks>
public static string HttpGet(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "GET";
        request.Accept = "text/html, application/xhtml+xml, */*";
        request.ContentType = "application/json";

        // Dispose the response as well as the reader so the underlying
        // connection is released even if reading the body throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best effort by design: callers receive null instead of an
        // exception, but the reason is no longer lost.
        Console.Error.WriteLine("HttpGet failed for " + url + ": " + ex.Message);
        return null;
    }
}
////得到省的信息
//static void f(string url)
//{
// var htmlString = HttpGet(url);
// HtmlParser htmlParser = new HtmlParser();
// var data = htmlParser.Parse(htmlString)
// .QuerySelectorAll(".provincetr")
// .Select(t => t)
// .ToList();

// List<Node> list = new List<Node>();
// foreach (var item in data)
// {
// var area = htmlParser.Parse(item.InnerHtml)
// .QuerySelectorAll("a")
// .Select(t => t).ToList();

// foreach (var td in area)
// {
// Node node = new Node();
// node.code = td.GetAttribute("href");
// node.area = td.TextContent;
// list.Add(node);
// }
// }
// AreaDBEntities areaDBEntities = new AreaDBEntities();
// foreach (var item in list)
// {
// AreaInfo areaInfo = new AreaInfo();
// Console.WriteLine(item.code + "----" + item.area);
// areaInfo.Code = int.Parse(item.code.Substring(0, item.code.IndexOf(".")));
// areaInfo.Name = item.area;
// areaInfo.PedarId = null;
// areaDBEntities.AreaInfoes.Add(areaInfo);
// }
// areaDBEntities.SaveChanges();
// Console.WriteLine();
// //}
//}
}
/// <summary>
/// Plain holder for one scraped code/name pair.
/// </summary>
internal class Node
{
    /// <summary>Scraped code text (cell content or link target, depending on the caller).</summary>
    public string code { get; set; }

    /// <summary>Scraped area name text.</summary>
    public string area { get; set; }
}

/// <summary>
/// Holder for a single string value. Not referenced by any code visible in
/// this listing.
/// </summary>
internal class td
{
    /// <summary>The stored string value.</summary>
    public string td1 { get; set; }
}
}

等下会将生成数据库脚本分享出来,可以私聊我

写了一个小爬虫,把CSDN上发表的博客全都备份了下。

获取发表过的文章信息,存入到数据库。

C#中用 AngleSharp这个组件就可以像用linq一样进行html标签的查询操作。

所以从html里获取需要的内容是非常方便的

具体代码,随便写的:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
using AngleSharp.Parser.Html;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

namespace Crawler
{
class Program
{
/// <summary>
/// Entry point of the blog crawler. The list and detail crawl steps are
/// kept as commented-out code; the active step fetches a single page
/// through <c>f</c> and then waits for a line of input.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.WriteLine("Hello World!");

    // Step 1 (disabled): collect the article list, pages 1 to 10.
    //for (int i = 1; i <= 10; i++)
    //{
    //    fuac("https://blog.csdn.net/qq_32688731/article/list/"+i);
    //    Console.WriteLine(count);
    //}

    // Step 2 (disabled): fetch the details of every stored article.
    //bds284289328_dbEntities1 db = new bds284289328_dbEntities1();
    //db.CSDN_Article.ToList().ForEach(r =>
    //{
    //    ff(r.Link, r.Id);
    //    System.Threading.Thread.Sleep(500);
    //});

    const string pageUrl = "https://user.qzone.qq.com/1439084907";
    const int id = -1;
    f(pageUrl, id);

    Console.ReadLine();
}

static int count = 0;
/// <summary>
/// Downloads a page, parses it and materialises its <c>body</c> elements.
/// The parsed result is not used further.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="id">Not used by this method.</param>
static void f(string url, int id)
{
    using (HttpClient client = new HttpClient())
    {
        // Blocks the calling thread until the download has finished.
        string html = client.GetStringAsync(url).Result;

        HtmlParser parser = new HtmlParser();
        var document = parser.Parse(html);
        var bodies = document.QuerySelectorAll("body").ToList();
    }
}
/// <summary>
/// Downloads one article page and stores its content, details markup and
/// category number as a <c>CSDN_Details</c> row linked to the given list entry.
/// </summary>
/// <param name="url">Address of the article page.</param>
/// <param name="id">Stored as <c>ArticleListId</c> on each inserted row.</param>
/// <remarks>
/// The category number is the text after the last '/' of the first
/// <c>.subItem_t a</c> link. It is stored as -1 when the link is missing or
/// its tail is not a number. A <c>#main</c> element without
/// <c>.article_content</c> or <c>#article_details</c> is reported on
/// standard error and skipped instead of raising a null reference.
/// </remarks>
static void ff(string url, int id)
{
    using (HttpClient http = new HttpClient())
    {
        // Blocks the calling thread until the download has finished.
        string html = http.GetStringAsync(url).Result;
        HtmlParser htmlParser = new HtmlParser();
        var containers = htmlParser.Parse(html).QuerySelectorAll("#main");

        // NOTE(review): assumes bds284289328_dbEntities1 is a disposable
        // Entity Framework context - confirm.
        using (bds284289328_dbEntities1 db = new bds284289328_dbEntities1())
        {
            foreach (var container in containers)
            {
                var contentNode = container.QuerySelectorAll(".article_content").FirstOrDefault();
                var detailsNode = container.QuerySelectorAll("#article_details").FirstOrDefault();
                if (contentNode == null || detailsNode == null)
                {
                    Console.Error.WriteLine("ff: article markup not found, skipped " + url);
                    continue;
                }

                // Example link: https://blog.csdn.net/qq_32688731/article/category/6568994
                int categoryId = -1;
                var categoryLink = container.QuerySelectorAll(".subItem_t a").FirstOrDefault();
                string href = categoryLink == null ? null : categoryLink.GetAttribute("href");
                if (href != null)
                {
                    int parsed;
                    if (int.TryParse(href.Substring(href.LastIndexOf('/') + 1), out parsed))
                    {
                        categoryId = parsed;
                    }
                }

                CSDN_Details cSDN_Details = new CSDN_Details();
                cSDN_Details.ArticleType = categoryId;
                cSDN_Details.ArticleContent = contentNode.InnerHtml.Trim().Replace("\n", "");
                cSDN_Details.ArticleDetails = detailsNode.InnerHtml.Trim().Replace("\n", "");
                cSDN_Details.ArticleListId = id;

                db.CSDN_Details.Add(cSDN_Details);
                count++;
                Console.WriteLine(count);
            }

            db.SaveChanges();
        }
    }
}


/// <summary>
/// Downloads one article-list page and stores every <c>.list_item</c> entry
/// on it as a <c>CSDN_Article</c> row.
/// </summary>
/// <param name="url">Address of the list page.</param>
/// <remarks>
/// An entry without a title link is reported on standard error and skipped.
/// Any other missing element is read as an empty string, so a missing date
/// or counter surfaces as a <see cref="FormatException"/> from the
/// conversion instead of a null reference. Nothing is saved when a
/// conversion fails, because <c>SaveChanges</c> runs only after the whole
/// page has been processed.
/// </remarks>
static void fuac(string url)
{
    using (HttpClient http = new HttpClient())
    {
        // Blocks the calling thread until the download has finished.
        string html = http.GetStringAsync(url).Result;
        HtmlParser htmlParser = new HtmlParser();
        var entries = htmlParser.Parse(html).QuerySelectorAll(".list_item");

        // NOTE(review): assumes bds284289328_dbEntities1 is a disposable
        // Entity Framework context - confirm.
        using (bds284289328_dbEntities1 db = new bds284289328_dbEntities1())
        {
            foreach (var entry in entries)
            {
                var titleLink = entry.QuerySelectorAll(".link_title a").FirstOrDefault();
                string href = titleLink == null ? null : titleLink.GetAttribute("href");
                if (href == null)
                {
                    Console.Error.WriteLine("fuac: entry without a title link, skipped on " + url);
                    continue;
                }

                // Text of the first element matching the selector inside this
                // entry, trimmed and with line breaks removed; empty when
                // nothing matches.
                Func<string, string> textOf = selector =>
                {
                    var node = entry.QuerySelectorAll(selector).FirstOrDefault();
                    string raw = node == null ? string.Empty : node.TextContent;
                    return raw.Trim().Replace("\n", "");
                };

                CSDN_Article cSDN_Article = new CSDN_Article();

                // 1 when the entry carries the "original" icon, otherwise 0.
                cSDN_Article.Type = entry.QuerySelectorAll(".ico_type_Original").FirstOrDefault() != null ? 1 : 0;

                cSDN_Article.Link = href.Trim().Replace("\n", "");
                cSDN_Article.Title = textOf(".link_title");
                cSDN_Article.Description = textOf(".article_description");

                cSDN_Article.Postdate = Convert.ToDateTime(textOf(".link_postdate"));

                // The counters are rendered as a label with the number in
                // parentheses; strip the label and the closing parenthesis.
                cSDN_Article.ViewCount = Convert.ToInt32(textOf(".link_view").Replace("阅读(", "").Replace(")", ""));
                cSDN_Article.Comments = Convert.ToInt32(textOf(".link_comments").Replace("评论(", "").Replace(")", ""));

                db.CSDN_Article.Add(cSDN_Article);

                // Kept from the original: short pause per stored entry.
                System.Threading.Thread.Sleep(100);
                count++;
            }

            db.SaveChanges();
        }
    }
}
}

/// <summary>
/// Raw values scraped for one entry of the article list page.
/// </summary>
internal class list_item
{
    /// <summary>Article type: 1 = original, 0 = reposted.</summary>
    public int article_type { get; set; }

    /// <summary>Link to the article.</summary>
    public string article_link { get; set; }

    /// <summary>Article title.</summary>
    public string article_title { get; set; }

    /// <summary>Article description.</summary>
    public string article_description { get; set; }

    /// <summary>Publication time, as scraped text.</summary>
    public string article_postdate { get; set; }

    /// <summary>Number of views, as scraped text.</summary>
    public string article_view { get; set; }

    /// <summary>Number of comments, as scraped text.</summary>
    public string article_comments { get; set; }
}

/// <summary>
/// Raw values scraped from one article page.
/// </summary>
internal class details_itme
{
    /// <summary>Article category.</summary>
    public string ArticleType { get; set; }

    /// <summary>Article content.</summary>
    public string ArticleContent { get; set; }

    /// <summary>Article details.</summary>
    public string ArticleDetails { get; set; }

    /// <summary>Foreign key to the article list entry.</summary>
    public string ArticleListId { get; set; }
}
}