代码如下:
import requests
from pyquery import PyQuery
import re
import os
import csv
import datetime
"""
说明:该代码是专门为爬取http://www.kgtmall.com.cn/商品而设计的。
使用方法:
1、在本地提前安装好python3的环境;
2、直接运行本代码;
3、运行本代码完后,会在当前目录生成一个result.csv文件,该文件里面就存了爬取该站点的商品信息
注意事项:在本代码运行期间,不能打开result.csv文件,因为这样程序就写不进去数据了;只能等本代码
全部运行结束后,才能打开result.csv文件进行查看。
"""
def get_html_text(url, timeout=10):
    """
    Download a page and return its body as text.

    :param url: address of the page to fetch
    :param timeout: seconds to wait for the server before giving up; without
        a timeout ``requests.get`` may block indefinitely on a stalled
        connection and freeze the whole crawl
    :return: the response body as decoded by ``requests`` (``Response.text``)
    :raises requests.exceptions.RequestException: on connection failure or
        when the timeout expires
    """
    response = requests.get(url, timeout=timeout)
    return response.text
def get_one_level_class(home_url):
    """
    Yield the first-level categories found in the home page menu.

    Sample output (titles are the site's own labels):
    母婴用品 http://www.kgtmall.com.cn/mall/list.php?catid=4
    生活家居 http://www.kgtmall.com.cn/mall/list.php?catid=5

    :param home_url: address of the shop's home page
    :return: generator of ``(one_level_url, one_level_title)`` tuples, one per
        element matched by ``.menu_title a``
    """
    home_page = PyQuery(get_html_text(home_url))
    for node in home_page('.menu_title a'):
        # Wrap the raw lxml node so the link can be queried with selectors.
        anchor = PyQuery(node)('a')
        yield anchor.attr('href'), anchor.text()
def get_two_level_class(home_url):
    """
    Yield the second-level categories listed on each first-level category page.

    Sample output (titles are the site's own labels):
    母婴用品 营养辅食 http://www.kgtmall.com.cn/mall/search.php?catid=539
    母婴用品 妈妈专区 http://www.kgtmall.com.cn/mall/search.php?catid=544

    :param home_url: address of the shop's home page
    :return: generator of ``(one_level_title, two_level_title, two_level_url)``
        tuples, one per ``.selector_category li`` entry
    """
    for one_level_url, one_level_title in get_one_level_class(home_url):
        # The address must be passed as ``url=``: pyquery >= 2.0 treats a bare
        # positional string as markup to parse and no longer downloads it.
        category_page = PyQuery(url=one_level_url)
        for node in category_page('.selector_category li'):
            anchor = PyQuery(node)('a')
            two_level_url = anchor.attr('href')
            two_level_title = anchor.text()
            yield one_level_title, two_level_title, two_level_url
def get_pages(url):
    """
    Return the number of result pages of a product listing.

    The pager text is read from ``.pagination cite`` and is matched against
    the form ``共<total>条/<pages>页``. When the text is missing or does not
    match, a single page is assumed.

    :param url: address of the listing page
    :return: page count as ``int`` (1 as the fallback)
    """
    # The address must be passed as ``url=``: pyquery >= 2.0 treats a bare
    # positional string as markup to parse and no longer downloads it.
    listing = PyQuery(url=url)
    pages = listing('.pagination cite').text()
    print('原pages:', pages)
    try:
        # Capture digits only; a greedy ``.*`` would swallow any text that
        # follows the number and make ``int()`` fail.
        pages = int(re.findall(r'共.*?条/(\d+)页', pages)[0])
    except IndexError as e:
        # No pager on the page (or an unexpected format): treat as one page.
        print(e)
        pages = 1
    print('页码:', pages)
    return pages
def get_three_level_class(home_url):
    """
    Yield one listing-page URL per page of every third-level category.

    Sample categories (titles are the site's own labels):
    母婴用品 营养辅食 DHA http://www.kgtmall.com.cn/mall/search.php?catid=548
    母婴用品 营养辅食 益生菌/初乳 http://www.kgtmall.com.cn/mall/search.php?catid=549

    :param home_url: address of the shop's home page
    :return: generator of ``(one_level_title, two_level_title,
        three_level_title, three_level_url_by_xiaoliang)`` tuples; the last
        item is the search URL for one page of the category
    """
    catid_pattern = re.compile(
        r'http://www\.kgtmall\.com\.cn/mall/search\.php\?catid=(\d+)')
    for one_level_title, two_level_title, two_level_url in get_two_level_class(home_url):
        # The address must be passed as ``url=``: pyquery >= 2.0 treats a bare
        # positional string as markup to parse and no longer downloads it.
        sub_category_page = PyQuery(url=two_level_url)
        for node in sub_category_page('.selector_category li'):
            anchor = PyQuery(node)('a')
            three_level_title = anchor.text()
            three_level_url = anchor.attr('href')
            # ``attr`` returns None for an entry without a link; one such entry
            # (or a link of a different form) must not abort the whole crawl.
            match = catid_pattern.search(three_level_url or '')
            if match is None:
                print('skip category link without catid:', three_level_title, three_level_url)
                continue
            catid = match.group(1)
            pages = get_pages(three_level_url)
            for index in range(1, pages + 1):
                # NOTE(review): ``order=10`` presumably sorts by sales volume
                # ("xiaoliang") - confirm against the site.
                three_level_url_by_xiaoliang = 'http://www.kgtmall.com.cn/mall/search.php?kw=&list=0&catid={}&order=10&minprice=&maxprice=&page={}'.format(
                    catid, index)
                yield one_level_title, two_level_title, three_level_title, three_level_url_by_xiaoliang
def shop_title_and_url(home_url):
    """
    Yield the title and detail-page URL of every product on every listing page.

    Sample output (titles are the site's own labels):
    母婴用品 营养辅食 DHA 澳洲直邮 澳大利亚RIFOLD 儿童DHA90粒(一月以上适用) http://www.kgtmall.com.cn/mall/show.php?itemid=28089

    :param home_url: address of the shop's home page
    :return: generator of ``(one_level_title, two_level_title,
        three_level_title, shop_title, shop_url)`` tuples; the title is taken
        from the ``alt`` attribute of the product image
    """
    for one_level_title, two_level_title, three_level_title, three_level_url_by_xiaoliang in get_three_level_class(home_url):
        # The address must be passed as ``url=``: pyquery >= 2.0 treats a bare
        # positional string as markup to parse and no longer downloads it.
        listing = PyQuery(url=three_level_url_by_xiaoliang)
        for node in listing('.list_img a'):
            link = PyQuery(node)
            shop_url = link('a').attr('href')
            shop_title = link('a img').attr('alt')
            yield one_level_title, two_level_title, three_level_title, shop_title, shop_url
def get_shop_info(home_url, count):
    """
    Crawl every product page and append one CSV row per product.

    Each row holds the three category titles, product title, bar code,
    origin, specification, price and product URL. Rows are appended to
    ``result.csv`` in the directory that contains this script.

    :param home_url: address of the shop's home page
    :param count: number printed next to the first product; it increases by
        one for each following product
    :return: None
    """
    # ``os.path.dirname(__file__)`` alone is '' when the script is started by
    # its bare file name (Python < 3.9), which would turn ``'' + '/result.csv'``
    # into a file in the filesystem root. Resolve to an absolute path first.
    csv_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'result.csv')
    products = shop_title_and_url(home_url)
    for number, (one_level_title, two_level_title, three_level_title, shop_title, shop_url) in enumerate(products, start=count):
        print('--排错:' + one_level_title, two_level_title, three_level_title, shop_title, shop_url)
        # The address must be passed as ``url=``: pyquery >= 2.0 treats a bare
        # positional string as markup to parse and no longer downloads it.
        product_page = PyQuery(url=shop_url)
        price = product_page('.price').text()
        bar_code = product_page('.bar_code dl dd p').text()
        detail_text = product_page('#content').text()
        # ``findall(...)[0]`` raises IndexError when the label is absent from
        # the description; fall back to a placeholder in that case only.
        try:
            guige = re.findall('规格:(.*)', detail_text)[0]
        except IndexError:
            guige = '没有规格'
        try:
            chandi = re.findall('产地:(.*)', detail_text)[0]
        except IndexError:
            chandi = '没有产地'
        print(number, one_level_title, two_level_title, three_level_title, shop_title, bar_code, chandi, guige, price, shop_url)
        row = [one_level_title, two_level_title, three_level_title, shop_title, bar_code, chandi, guige, price, shop_url]
        # Re-open in append mode for every row so finished rows are on disk even
        # if a later page fails. newline='' avoids blank lines between rows.
        with open(csv_file, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(row)
def main():
    """
    Run the whole crawl: create ``result.csv`` with its header row, collect
    all products, then print the elapsed time.

    :return: None
    """
    start_time = datetime.datetime.now()
    home_url = 'http://www.kgtmall.com.cn/'
    # ``os.path.dirname(__file__)`` alone is '' when the script is started by
    # its bare file name (Python < 3.9), which would turn ``'' + '/result.csv'``
    # into a file in the filesystem root. Resolve to an absolute path first.
    csv_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'result.csv')
    headers = ['一级分类', '二级分类', '三级分类', '商品名称', '条码', '产地', '规格', '价格', '商品链接']
    # Mode 'w' truncates any previous result. newline='' avoids blank lines
    # between rows; the explicit encoding avoids charset errors on write.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(headers)
    get_shop_info(home_url, 1)
    end_time = datetime.datetime.now()
    timediff = end_time - start_time
    # ``timedelta.seconds`` is only the sub-day remainder; ``total_seconds()``
    # stays correct for runs longer than 24 hours.
    print('总共用时{}秒\n'.format(str(int(timediff.total_seconds()))))
    print('全部商品已经按需求完成!!!')
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
运行后,会在当前目录下生成个result.csv文件,内容如下: