当前位置：首页 > news >正文

成都哪里好玩适合年轻人30分钟seo网站

news 2025/11/21 17:58:32

成都哪里好玩适合年轻人,30分钟seo网站,最简洁 wordpress主题,专业建设计划文章目录 1 前言2 网络爬虫2.1 构造自己的Scrapy爬虫2.1.1 items.py2.1.2 spiders子目录2.1.3 pipelines.py 2.2 构造可接受参数的Scrapy爬虫2.3 运行Scrapy爬虫2.3.1 在命令行运行2.3.2 在程序中调用 2.4 运行Scrapy的一些要点 3 大规模非结构化数据的存储与分析4 全部代码 1 … 文章目录 1 前言2 网络爬虫2.1 构造自己的Scrapy爬虫2.1.1 items.py2.1.2 spiders子目录2.1.3 pipelines.py 2.2 构造可接受参数的Scrapy爬虫2.3 运行Scrapy爬虫2.3.1 在命令行运行2.3.2 在程序中调用 2.4 运行Scrapy的一些要点 3 大规模非结构化数据的存储与分析4 全部代码 1 前言 介绍几种常见的数据收集、存储、组织以及分析的方法和工具。首先介绍如何构造自己的网络爬虫,从网上抓取内容并将其中按照一定结构组织的信息抽取出来;然后介绍如何使用ElasticSearch来有效地存储、组织和查询非结构化数据;最后简要介绍使用Spark对大规模的非结构化数据进行初步分析的方法。 2 网络爬虫 2.1 构造自己的Scrapy爬虫 在终端输入scrapy startproject money163,会自动生成一个同名的子目录和一个scrapy.cfg配置文件。有两个init文件都是空白的,暂时不用管,将精力放在items.py、settings.py、pipelines.py和将要在spiders子目录下生成的爬虫程序上。基本结构建立起来之后,需要按照说明的步骤依次完成对内容抽取、爬虫目标和行为以及数据操作的定义,每一个定义都对应一个文件。 2.1.1 items.py 在这个文件里面定义需要抽取的内容,这基本上是通过定义一个继承于scrapy.Item的内容类来完成的。每一个内容都属于scrapy.Field(),定义非常简单,即“内容名称 = scrapy.Field()”。 2.1.2 spiders子目录 在spiders子目录下新建一个python文件,假设命名为money_spider.py。这个文件比较复杂,可以继承不同的类来定义。首先使用Scrapy的CrawlSpider类,定义三个内容:一是爬虫的名字;二是目标网站,包括爬取模式和对返回链接的过滤等;三是返回的对象,按照其结构抽取所需要的数据。在money_spider.py文件中输入以下代码,注意将Stock163换成money163: # encoding: utf-8 import scrapy import re from scrapy.selector import Selector from stock163.items import Stock163Item from scrapy.linkextractors import LinkExtractor from scrapy.spiders import CrawlSpider, Ruleclass ExampleSpider(CrawlSpider):name stocknews #爬虫的名字为 stocknewsallowed_domains [money.163.com]#设置允许爬取的域名def __init__(self, id600000, page0, *args, **kwargs):#初始化方法设置了一些初始参数包括 id默认为 600000、page默认为 0以及其他可能传递的参数。# allowrule /%s/%s\d/\d/* % (year, month)# allowrule /%s/%s%s/\d/* % (year, month, day) #这个规则匹配类似 /2022/11/25/ 这样的日期结构allowrule r/\d/\d/\d/*# 定义了一个正则表达式用于匹配新闻链接的规则。数字数字数字任意字符self.counter 0 # 初始化一个计数器可能用于跟踪爬取的新闻数量。self.stock_id id # 保存股票IDself.start_urls [http://quotes.money.163.com/f10/gsxw_%s,%s.html % (id, page)] # 设置初始爬取的URL这里使用了 id 和 page 参数构造URL。ExampleSpider.rules (Rule(LinkExtractor(allowallowrule), callbackparse_news, followFalse),)# 定义了爬取规则。这里使用了 LinkExtractor 
来提取链接通过正则表达式 allowallowrule 匹配链接规则然后指定了回调函数为 parse_news# 最后设置 followFalse 表示不跟踪从当前链接提取的链接。# recompile the rulesuper(ExampleSpider, self).__init__(*args, **kwargs)# 调用父类CrawlSpider的初始化方法确保爬虫的正确初始化。rulesRule(LinkExtractor(allowr/\d/\d/\d/*),callbackparse_news, followTrue)# f open(out.txt, w)def printcn(suni):for i in suni:print(suni.encode(utf-8))def parse_news(self, response):item Stock163Item()item[news_thread] response.url.strip().split(/)[-1][:-5]#这行代码从响应的URL中提取新闻线程信息。它首先通过response.url获取当前页面的URL然后使用strip()方法去除首尾的空格接着使用split(/)方法根据斜杠切割URL为一个列表最后通过[-1]#取列表的最后一个元素即URL中最后一个斜杠后的部分。[: -5] 是为了去掉文件扩展名假设是.html或类似的扩展名剩下的部分就是新闻线程的信息然后将其赋值给item对象的news_thread属性。self.get_thread(response,item)self.get_title(response, item)self.get_source(response, item)self.get_url(response, item)self.get_news_from(response, item)self.get_from_url(response, item)self.get_text(response, item)return item ##############!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!remenber to Retrun Item after parsedef get_title(self, response, item):title response.xpath(/html/head/title/text()).extract()if title:# print (title:title[0][:-5].encode(utf-8))item[news_title] title[0][:-5]def get_source(self, response, item):source response.xpath(//div[classleft]/text()).extract()if source:# print (sourcesource[0][:-5].encode(utf-8))item[news_time] source[0][:-5]def get_news_from(self, response, item):news_from response.xpath(//div[classleft]/a/text()).extract()if news_from:# print fromnews_from[0].encode(utf-8)item[news_from] news_from[0]def get_from_url(self, response, item):from_url response.xpath(//div[classleft]/a/href).extract()if from_url:# print (urlfrom_url[0].encode(utf-8) )item[from_url] from_url[0]def get_text(self, response, item):news_body response.xpath(//div[idendText]/p/text()).extract()if news_body:# for entry in news_body:# print (entry.encode(utf-8))item[news_body] news_bodydef get_url(self, response, item):news_url response.urlif news_url:print(news_url)item[news_url] news_url 2.1.3 pipelines.py 
接着需要对所抽取的具体要素进行处理,要么显示在终端的窗口中,要么存入某个地方或者数据库中。现在我们假设将所抽取出来的要素构造成一个字典,以JSON文档的格式存为文本文件,每个页面单独存成一个文件。这个时候需要定义一个类,这个类里面只有一个方法process_item(self,item,spider): # -*- coding: utf-8 -*-# Define your item pipelines here # # Dont forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html #encoding: utf-8 import os def ParseFilePath(url, id):# user should change this folder pathoutfolder e:\\data\\FinTech\\News\\Stocks\\%s % idcomponents url.split(/)year components[3]monthdaycomponents[4]month monthday[:2]day monthday[2:]idxcomponents[5]pageidx_components[6]#folder outfolder \\%s_%s_%s_ % (year, month, day)folder outfolderif ((year) | (keywords in page)):filepathxxxelse:filepath folder \\%s_%s_%s_%s.txt % (year, month, day, page) filepathfilepath.replace(?, _)return(folder, filepath)class Stock163Pipeline(object): def process_item(self, item, spider):if spider.name ! stocknews: return itemif item.get(news_thread, None) is None: return itemurl item[news_url]if keywords in url:return itemfolder, filepath ParseFilePath(url, spider.stock_id)spider.counter spider.counter1counterfilepath folder\\counter.txt#one a single machine will is virtually no risk of race-conditionif not os.path.exists(folder):os.makedirs(folder) #print(filepath, counterfilepath)#print(spider.stats)fo open(counterfilepath, w, encodingUTF-8)fo.write(str(spider.counter))fo.close()if (filepath!xxx):fo open(filepath, w, encodingutf-8)fo.write(str(dict(item)))fo.close()return None 2.2 构造可接受参数的Scrapy爬虫 这节内容主要介绍如何改变起始网页的地址,从而使得同一个爬虫爬取不同的网站:修改了start_urls,同时也修改了allowed_domains,保证爬虫顺利进行,最后通过super方法执行这个类来更新参数。 class ExampleSpider(CrawlSpider):name stocknewsdef __init__(self, id600000, page0, *args, **kwargs): #allowrule /%s/%s\d/\d/* % (year, month)allowrule /%s/%s%s/\d/* % (year, month, day) self.counter 0self.stock_id idself.start_urls [http://\%s \% (site)]ExampleSpider.rules(Rule(LinkExtractor(allowallowrule), callbackparse_news, followFalse),)#recompile the rule 2.3 
运行Scrapy爬虫 一种是在命令行里面执行crawl命令,另一种是在别的程序中调用Scrapy爬虫。命令行中是单线程,程序调用是多线程,一次可以同时爬取不同的网站;当然也可以通过twisted包里面的internet.defer方法来将每个爬虫串联起来,同时调用reactor来控制执行顺序。Scrapy也可以在多台机器上部署分布式。 2.3.1 在命令行运行 在命令行中非常简单:进入项目的主目录,即包含scrapy.cfg文件的那个目录,输入scrapy crawl money163。这里的money163是在spider.py程序文件中使用 name = "money163" 定义的爬虫名字,crawl是让Scrapy爬虫开始爬取网页。需要传递参数时使用-a选项,例如“scrapy crawl money163 -a site=money.163.com/stock”。 2.3.2 在程序中调用 在别的程序里调用Scrapy爬虫可以使用不同的类,这里使用CrawlerProcess类配合get_project_settings方法,就可以在项目目录中非常方便地使用别的程序运行自己的爬虫。首先引入相应的模块和函数: from scrapy.crawler import CrawlerProcess from scrapy.utils.project import get_project_settings 然后定义爬虫过程,在定义的过程中先通过get_project_settings获取项目的信息,再传给所定义的爬虫过程: process CrawlerProcess(get_project_settings()) 定义好爬虫过程后,只需调用这个过程对象(包括传递参数)就能运行爬虫了,比如process.crawl(stocknews, idstockid, pagestr(page))。下面按照列表中的三个网址定义了三个爬虫,最后通过process.start来启动爬虫。因为使用了get_project_settings,这个python程序需要在项目所在目录下执行才能有效运行: for site in [money.163.com, tech.163.com, money.163.com/stock]: process.crawl(myspider, site site) process.start() 2.4 运行Scrapy的一些要点 有些网站会对网络请求是否是网络爬虫进行识别,如果发现是网络爬虫则会进行约束,比如限制流量甚至直接拒绝响应,因此需要合理设置settings.py和middleware文件里面的选项来实现。 3 大规模非结构化数据的存储与分析 非结构化的数据是指没有定义结构的数据。一种典型的非结构化数据是文本,包括日期、数字、人名、事件等,这样的数据没有规则可循。数据挖掘、自然语言处理、文本分析等技术提供了不同方法,从非结构化数据里找出模式。处理文本常用的技巧通常涉及到元数据或者词性标签。手动标记非结构化数据一般使用schema.org定义的类型和属性作为标记(比如JSON-LD);当单个网页上有多种实体类型时,这些实体应该都被标记,例如视频schema.org/VideoObject。 4 全部代码 from keras.applications.vgg16 import VGG16 from keras.layers import Input,Flatten,Dense,Dropout from keras.models import Model from keras.optimizers import SGDfrom keras.datasets import mnistimport cv2 import h5py as h5py import numpy as npmodel_vgg VGG16(include_topFalse,weightsimagenet,input_shape(ishape,ishape,3)) model Flatten(nameflatten)(model_vgg.output) model Dense(4096,activationrelu,namefc1)(model) model Dense(4096,activationrelu,namefc2)(model) model Dropout(0.5)(model) model Dense(10,activationsoftmax)(model) model_vgg_mnist Model(model_vgg.input,model,namevgg16)model_vgg_mnist.summary()model_vgg VGG16(include_topFalse,weightsimagenet,input_shape(224,224,3)) for layer 
in model_vgg.layers:layer.trainableFalse model Flatten()(model_vgg.output) model Dense(4096,activationrelu,namefc1)(model) model Dense(4096,activationrelu,namefc2)(model) model Dropout(0.5)(model) model Dense(10,activationsoftmax,nameprediction)(model) model_vgg_mnist_pretrain Model(model_vgg.input,model,namevgg16_pretrain)model_vgg_mnist_pretrain.summary()sgd SGD(lr 0.05,decay1e-5) model_vgg_mnist_pretrain.compile(losscategorical_crossentropy,optimizersgd,metrics[accuracy])(x_train,y_train),(x_test,y_test) mnist.load_data() x_train [cv2.cvtColor(cv2.resize(i,(ishape,ishape)),cv2.COLOR_GRAY2BGR) for i in x_train] x_train np.concatenate([arr[np.newaxis] for arr in x_train]).astype(float32) x_test [cv2.cvtColor(cv2.resize(i,(ishape,ishape)),cv2.COLOR_GRAY2BGR) for i in x_test] x_test np.concatenate([arr[np.newaxis] for arr in x_test]).astype(float32)x_test.shape x_train.shapex_train / 255 x_test / 255np.where(x_train[0]!0)def tran_y(y):y_ohe np.zeros(10)y_ohe[y] 1return y_ohey_train_ohe np.array([tran_y(y_train[i]) for i in range(len(y_train))]) y_test_ohe np.array([tran_y(y_test[i]) for i in range(len(y_test))])model_vgg_mnist_pretrain.fit(x_train,y_train_ohe,validation_data(x_test,y_test_ohe),epochs200,batch_size128)

查看全文

http://www.zqtcl.cn/news/992455/