网站的版式,外贸营销网站建站,这么开网站,dede做漫画网站的案例 前言：大家好，我是晓晨。许久没有更新博客了，今天给大家带来一篇干货型文章，一个每隔5分钟抓取博客园首页文章信息并在第二天的上午9点发送到你的邮箱的小工具。比如我在2018年2月14日，9点来到公司我就会收到一封邮件，是… 前言：大家好，我是晓晨。许久没有更新博客了，今天给大家带来一篇干货型文章：一个每隔5分钟抓取博客园首页文章信息，并在第二天的上午9点发送到你的邮箱的小工具。比如我在2018年2月14日9点来到公司，我就会收到一封邮件，是2018年2月13日的博客园首页的文章信息。写这个小工具的初衷是：一直有看博客的习惯，但是最近由于各种原因吧，可能几天都不会看一下博客，要是中途错过了什么好文，可是十分心疼的，哈哈。所以做了个工具，每天归档发到邮箱，妈妈再也不会担心我错过好的文章了。为什么只抓取首页？因为博客园首页文章的质量相对来说高一些。准备：作为一个持续运行的工具，没有日志记录怎么行？我准备使用的是NLog来记录日志，它有个日志归档功能，非常不错。在http请求中，由于网络问题吧，可能会出现失败的情况，这里我使用Polly来进行Retry。使用HtmlAgilityPack来解析网页，需要对xpath有一定了解。下面是详细说明（组件名 / 用途 / github）：NLog / 记录日志 / https://github.com/NLog/NLog ；Polly / 当http请求失败进行重试 / https://github.com/App-vNext/Polly ；HtmlAgilityPack / 网页解析 / https://github.com/zzzprojects/html-agility-pack ；MailKit / 发送邮件 / https://github.com/jstedfast/MailKit 。有不了解的组件，可以通过访问github获取资料。关于发送邮件，感谢下面的园友提供的资料：https://www.cnblogs.com/qulianqing/p/7413640.html 、http://www.cnblogs.com/rocketRobin/p/8337055.html 。获取、解析博客园首页数据：我使用的是HttpWebRequest来进行http请求，下面分享一下我简单封装的类库：using System;using System.IO;using System.Net;using System.Text;namespace CnBlogSubscribeTool{ /// summary/// Simple Http Request Class/// .NET Framework 4.0/// Author:stulzq/// CreatedTime:2017-12-12 15:54:47/// /summarypublic class HttpUtil{ static HttpUtil() { //Set connection limit ,Default limit is 2ServicePointManager.DefaultConnectionLimit 1024;} /// summary/// Default Timeout 20s/// /summarypublic static int DefaultTimeout 20000; /// summary/// Is Auto Redirect/// /summarypublic static bool DefalutAllowAutoRedirect true; /// summary/// Default Encoding/// /summarypublic static Encoding DefaultEncoding Encoding.UTF8; /// summary/// Default UserAgent/// /summarypublic static string DefaultUserAgent Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36; /// summary/// Default Referer/// /summarypublic static string DefaultReferer ; /// summary/// httpget request/// /summary/// param nameurlInternet Address/param/// returnsstring/returnspublic static 
string GetString(string url) { var stream GetStream(url); string result; using (StreamReader sr new StreamReader(stream)){result sr.ReadToEnd();} return result;} /// summary/// httppost request/// /summary/// param nameurlInternet Address/param/// param namepostDataPost request data/param/// returnsstring/returnspublic static string PostString(string url, string postData) { var stream PostStream(url, postData); string result; using (StreamReader sr new StreamReader(stream)){result sr.ReadToEnd();} return result;} /// summary/// Create Response/// /summary/// param nameurl/param/// param namepostIs post Request/param/// param namepostDataPost request data/param/// returns/returnspublic static WebResponse CreateResponse(string url, bool post, string postData ) { var httpWebRequest WebRequest.CreateHttp(url);httpWebRequest.Timeout DefaultTimeout;httpWebRequest.AllowAutoRedirect DefalutAllowAutoRedirect;httpWebRequest.UserAgent DefaultUserAgent;httpWebRequest.Referer DefaultReferer; if (post){ var data DefaultEncoding.GetBytes(postData);httpWebRequest.Method POST;httpWebRequest.ContentType application/x-www-form-urlencoded;charsetutf-8;httpWebRequest.ContentLength data.Length; using (var stream httpWebRequest.GetRequestStream()){stream.Write(data, 0, data.Length);}} try{ var response httpWebRequest.GetResponse(); return response;} catch (Exception e){ throw new Exception(string.Format(Request error,url:{0},IsPost:{1},Data:{2},Message:{3}, url, post, postData, e.Message), e);}} /// summary/// http get request/// /summary/// param nameurl/param/// returnsResponse Stream/returnspublic static Stream GetStream(string url) { var stream CreateResponse(url, false).GetResponseStream(); if (stream null){ throw new Exception(Response error,the response stream is null);} else{ return stream;}} /// summary/// http post request/// /summary/// param nameurl/param/// param namepostDatapost data/param/// returnsResponse Stream/returnspublic static Stream PostStream(string url, string 
postData) { var stream CreateResponse(url, true, postData).GetResponseStream(); if (stream null){ throw new Exception(Response error,the response stream is null);} else{ return stream;}}}
}获取首页数据：string res = HttpUtil.GetString("https://www.cnblogs.com");解析数据：我们成功获取到了html，但是怎么提取我们需要的信息（文章标题、地址、摘要、作者、发布时间）呢？这里就亮出了我们的利剑——HtmlAgilityPack，它是一个可以根据xpath来解析网页的组件。载入我们前面获取的html：HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
从上图中我们可以看出，每条文章所有信息都在一个class为post_item的div里，我们先获取所有的class="post_item"的div：
//获取所有文章数据项
var itemBodys = doc.DocumentNode.SelectNodes("//div[@class='post_item_body']");
我们继续分析，可以看出文章的标题在class="post_item_body"的div下面的h3标签下的a标签，摘要信息在class="post_item_summary"的p标签里面，发布时间和作者在class="post_item_foot"的div里。分析完毕，我们可以取出我们想要的数据了：
foreach (var itemBody in itemBodys)
{ //标题元素var titleElem itemBody.SelectSingleNode(h3/a); //获取标题var title titleElem?.InnerText; //获取urlvar url titleElem?.Attributes[href]?.Value; //摘要元素var summaryElem itemBody.SelectSingleNode(p[classpost_item_summary]); //获取摘要var summary summaryElem?.InnerText.Replace(\r\n, ).Trim(); //数据项底部元素var footElem itemBody.SelectSingleNode(div[classpost_item_foot]); //获取作者var author footElem?.SelectSingleNode(a)?.InnerText; //获取文章发布时间var publishTime Regex.Match(footElem?.InnerText, \\d-\\d-\\d \\d:\\d).Value;Console.WriteLine($标题{title});Console.WriteLine($网址{url});Console.WriteLine($摘要{summary});Console.WriteLine($作者{author});Console.WriteLine($发布时间{publishTime});Console.WriteLine(--------------华丽的分割线---------------);
}运行一下我们成功的获取了我们想要的信息。现在我们定义一个Blog对象将它们装起来。public class Blog{ /// summary/// 标题/// /summarypublic string Title { get; set; } /// summary/// 博文url/// /summarypublic string Url { get; set; } /// summary/// 摘要/// /summarypublic string Summary { get; set; } /// summary/// 作者/// /summarypublic string Author { get; set; } /// summary/// 发布时间/// /summarypublic DateTime PublishTime { get; set; }
}http请求失败重试我们使用Polly在我们的http请求失败时进行重试设置为重试3次。//初始化重试器_retryTwoTimesPolicy Policy.HandleException().Retry(3, (ex, count) {_logger.Error(Excuted Failed! Retry {0}, count);_logger.Error(Exeption from {0}, ex.GetType().Name);});测试一下可以看到当遇到exception是Polly会帮我们重试三次如果三次重试都失败了那么会放弃。发送邮件使用MailKit来进行邮件发送它支持IMAPPOP3和SMTP协议并且是跨平台的十分优秀。下面是根据前面园友的分享自己封装的一个类库using System.Collections.Generic;using CnBlogSubscribeTool.Config;using MailKit.Net.Smtp;using MimeKit;namespace CnBlogSubscribeTool{ /// summary/// send email/// /summarypublic class MailUtil{ private static bool SendMail(MimeMessage mailMessage,MailConfig config) { try{ var smtpClient new SmtpClient();smtpClient.Timeout 10 * 1000; //设置超时时间smtpClient.Connect(config.Host, config.Port, MailKit.Security.SecureSocketOptions.None);//连接到远程smtp服务器smtpClient.Authenticate(config.Address, config.Password);smtpClient.Send(mailMessage);//发送邮件smtpClient.Disconnect(true); return true;} catch{ throw;}} /// summary///发送邮件/// /summary/// param nameconfig配置/param/// param namereceives接收人/param/// param namesender发送人/param/// param namesubject标题/param/// param namebody内容/param/// param nameattachments附件/param/// param namefileName附件名/param/// returns/returnspublic static bool SendMail(MailConfig config,Liststring receives, string sender, string subject, string body, byte[] attachments null,string fileName) { var fromMailAddress new MailboxAddress(config.Name, config.Address); var mailMessage new MimeMessage();mailMessage.From.Add(fromMailAddress); foreach (var add in receives){ var toMailAddress new MailboxAddress(add);mailMessage.To.Add(toMailAddress);} if (!string.IsNullOrEmpty(sender)){ var replyTo new MailboxAddress(config.Name, sender);mailMessage.ReplyTo.Add(replyTo);} var bodyBuilder new BodyBuilder() { HtmlBody body }; //附件if (attachments ! 
null){ if (string.IsNullOrEmpty(fileName)){fileName 未命名文件.txt;} var attachment bodyBuilder.Attachments.Add(fileName, attachments); //解决中文文件名乱码var charset GB18030;attachment.ContentType.Parameters.Clear();attachment.ContentDisposition.Parameters.Clear();attachment.ContentType.Parameters.Add(charset, name, fileName);attachment.ContentDisposition.Parameters.Add(charset, filename, fileName); //解决文件名不能超过41字符foreach (var param in attachment.ContentDisposition.Parameters)param.EncodingMethod ParameterEncodingMethod.Rfc2047; foreach (var param in attachment.ContentType.Parameters)param.EncodingMethod ParameterEncodingMethod.Rfc2047;}mailMessage.Body bodyBuilder.ToMessageBody();mailMessage.Subject subject; return SendMail(mailMessage, config);}}
}测试一下。说明：关于抓取数据和发送邮件的调度、程序异常退出的数据处理等等，在此我就不详细说明了，有兴趣的看源码（文末有github地址）。抓取数据是增量更新的。不用RSS订阅的原因是RSS更新比较慢。完整的程序运行截图：每发送一次邮件，程序就会将记录时间调整到今天的9点，然后每次抓取数据之后就会判断当前时间减去记录时间是否大于等于24小时，如果符合就发送邮件，并且更新记录时间。收到的邮件截图：截图中的邮件标题为13日，但是邮件内容为14日，是因为我为了演示效果，将今天（14日）的数据copy到了13日的数据里面，不要被误导了。还提供一个附件，便于收集整理。好了，介绍完毕。我自己已经将这个小工具部署到服务器，想要享受这个服务的可以在评论留下邮箱（手动滑稽）。github: https://github.com/stulzq/CnBlogSubscribeTool ，如果你喜欢，欢迎来个star。原文地址: http://www.cnblogs.com/stulzq/p/8448183.html 。.NET社区新闻，深度好文，欢迎访问公众号文章汇总 http://www.csharpkit.com