小编给大家分享一下python中scrapy spider有哪些爬取方式,相信大部分人都还不怎么了解,因此分享这篇文章给大家参考一下,希望大家阅读完这篇文章后大有收获,下面让我们一起去了解一下吧!
spider的几种爬取方式:
爬取1页内容
按照给定列表拼出链接爬取多页
找到‘下一页'标签进行爬取
进入链接,按照链接进行爬取
下面分别给出了示例
1.爬取1页内容
#by 寒小阳(hanxiaoyang.ml@gmail.com)
import scrapy
class JulyeduSpider(scrapy.Spider):
name = "julyedu"
start_urls = [
'https://www.julyedu.com/category/index',
]
def parse(self, response):
for julyedu_class in response.xpath('//div[@class="course_info_box"]'):
print julyedu_class.xpath('a/h5/text()').extract_first()
print julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first()
print julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first()
print response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first())
print "\n"
yield {
'title':julyedu_class.xpath('a/h5/text()').extract_first(),
'desc': julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first(),
'time': julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first(),
'img_url': response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first())
}
2.按照给定列表拼出链接爬取多页
#by 寒小阳(hanxiaoyang.ml@gmail.com)
import scrapy
class CnBlogSpider(scrapy.Spider):
name = "cnblogs"
allowed_domains = ["cnblogs.com"]
start_urls = [
'http://www.cnblogs.com/pick/#p%s' % p for p in xrange(1, 11)
]
def parse(self, response):
for article in response.xpath('//div[@class="post_item"]'):
print article.xpath('div[@class="post_item_body"]/h4/a/text()').extract_first().strip()
print response.urljoin(article.xpath('div[@class="post_item_body"]/h4/a/@href').extract_first()).strip()
print article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip()
print article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip()
print response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip()
print article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip()
print article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip()
print ""
yield {
'title': article.xpath('div[@class="post_item_body"]/h4/a/text()').extract_first().strip(),
'link': response.urljoin(article.xpath('div[@class="post_item_body"]/h4/a/@href').extract_first()).strip(),
'summary': article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip(),
'author': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip(),
'author_link': response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip(),
'comment': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip(),
'view': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip(),
}
3.找到‘下一页'标签进行爬取
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'http://quotes.toscrape.com/tag/humor/',
]
def parse(self, response):
for quote in response.xpath('//div[@class="quote"]'):
yield {
'text': quote.xpath('span[@class="text"]/text()').extract_first(),
'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
}
next_page = response.xpath('//li[@class="next"]/@herf').extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callback=self.parse)
4.进入链接,按照链接进行爬取
#by 寒小阳(hanxiaoyang.ml@gmail.com)
import scrapy
class QQNewsSpider(scrapy.Spider):
name = 'qqnews'
start_urls = ['http://news.qq.com/society_index.shtml']
def parse(self, response):
for href in response.xpath('//*[@id="news"]/div/div/div/div/em/a/@href'):
full_url = response.urljoin(href.extract())
yield scrapy.Request(full_url, callback=self.parse_question)
def parse_question(self, response):
print response.xpath('//div[@class="qq_article"]/div/h2/text()').extract_first()
print response.xpath('//span[@class="a_time"]/text()').extract_first()
print response.xpath('//span[@class="a_catalog"]/a/text()').extract_first()
print "\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract())
print ""
yield {
'title': response.xpath('//div[@class="qq_article"]/div/h2/text()').extract_first(),
'content': "\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()),
'time': response.xpath('//span[@class="a_time"]/text()').extract_first(),
'cate': response.xpath('//span[@class="a_catalog"]/a/text()').extract_first(),
}
以上是“python中scrapy spider有哪些爬取方式”这篇文章的所有内容,感谢各位的阅读!相信大家都有了一定的了解,希望分享的内容对大家有所帮助,如果还想学习更多知识,欢迎关注天达云行业资讯频道!