通过 Python 爬虫爬取豆瓣图书 Top250

2020年6月22日 1440点热度 1人点赞 0条评论

robots.txt 规则查询

from urllib.robotparser import RobotFileParser

# Location of the robots.txt file for Douban's book site; passed to GetRobotsTxt.
UrlRobots = 'https://book.douban.com/robots.txt'

def GetRobotsTxt(url):
    """Fetch the robots.txt at *url* and print crawl permissions for three pages.

    For each of three fixed Douban book URLs, prints ``True`` or ``False``
    depending on whether the generic user agent ``'*'`` may fetch it
    according to the downloaded rules.

    Args:
        url: Address of the robots.txt file to download and parse.
    """
    # The constructor stores the URL exactly like set_url() would.
    parser = RobotFileParser(url)
    parser.read()

    probe_urls = (
        'https://book.douban.com/tag/?view=type&icn=index-sorttags-all',
        'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4',
        'https://book.douban.com/',
    )
    for probe in probe_urls:
        print(parser.can_fetch('*', probe))

# Run the robots.txt check for Douban's book site (downloads the file over the network).
GetRobotsTxt(UrlRobots)

爬虫代码

#  导入相关的库
from lxml import etree
import requests
import xlwt
import os
import fake_useragent

# ua = UserAgent(use_cache_server=False)
# ua = UserAgent(cache=False)
# ua = UserAgent(verify_ssl=False)


# Accumulates one row (a list of field values) per scraped book.
all_info_list = list()

# Column titles written to the first row of the spreadsheet.
header = [
    'name',
    'url',
    'author',
    'publisher',
    'date',
    'price',
    'rate',
    'comment',
    'bookPic',
]

# Listing-page URLs: the "start" offset runs 0, 25, ..., 225.
urls = list(map('https://book.douban.com/top250?start={}'.format, range(0, 250, 25)))


# 随机构建请求头
# from:https://fake-useragent.herokuapp.com/browsers/0.1.11
def get_header():
    """Return a random User-Agent string from fake_useragent.

    The UserAgent object is pointed at ``data/fake_useragent.json`` under
    the current working directory via its ``path`` argument.
    """
    cache_file = os.getcwd() + '/data/fake_useragent.json'
    return fake_useragent.UserAgent(path=cache_file).random


# 下载书本封面
def save_image_file(url, path, timeout=10):
    """Download the resource at *url* and write its raw bytes to *path*.

    Nothing is written unless the server answers with HTTP 200.

    Args:
        url: Address of the image to download.
        path: Destination file path; opened in binary write mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled connection.

    Returns:
        True if the file was written, False if the response status was not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'wb') as f:
        f.write(response.content)
    return True


# Scrape every listing page, then write the spreadsheet and download the
# cover images once, after all pages have been collected.

# The request needs a headers dict; the User-Agent value comes from
# get_header(), which returns a random user-agent string.
headers = {'User-Agent': get_header()}

# All output (spreadsheet and cover images) goes into this directory.
output_dir = 'doubanbookTop250'

for page_url in urls:
    # Fetch the listing page with requests and parse the HTML with lxml.
    # A timeout keeps a stalled connection from blocking the script forever.
    html = requests.get(page_url, headers=headers, timeout=10)
    selector = etree.HTML(html.text)

    # Each book sits in its own <tr class="item"> row.
    infos = selector.xpath('//tr[@class="item"]')

    for info in infos:
        # Book title
        name = info.xpath('td/div/a/@title')[0]
        # Link to the book's page (named book_url so it does not shadow the page URL)
        book_url = info.xpath('td/div/a/@href')[0]
        # Slash-separated line; the first field and the last three fields are used.
        fields = info.xpath('td/p/text()')[0].split('/')
        # Author
        author = fields[0]
        # Publisher
        publisher = fields[-3]
        # Publication date
        date = fields[-2]
        # Price
        price = fields[-1]
        # Rating
        rate = info.xpath('td[2]/div[2]/span[2]/text()')[0]
        # One-line comment; not every book has one.
        comments = info.xpath('td/p/span/text()')
        comment = comments[0] if comments else "空"
        # Cover image URL
        bookPic = info.xpath('td[1]/a/img/@src')[0]
        print(name, book_url, author, publisher, date, price, rate, comment, bookPic)
        all_info_list.append([name, book_url, author, publisher, date, price, rate, comment, bookPic])

# Make sure the output directory exists before anything is saved into it.
os.makedirs(output_dir, exist_ok=True)

book = xlwt.Workbook(encoding='utf_8')
# Create the worksheet.
# NOTE(review): 'Shee1' looks like a typo for 'Sheet1'; left unchanged so the
# generated file keeps the same sheet name.
sheet = book.add_sheet('Shee1')

# Header row.
for col, title in enumerate(header):
    sheet.write(0, col, title)

# Data rows start at row 1, directly below the header.
for row, record in enumerate(all_info_list, start=1):
    for col, data in enumerate(record):
        sheet.write(row, col, data)

# Save the workbook into the output directory.
book.save('doubanbookTop250/doubanbookTop250.xls')

# Download each cover image, named after the book title.
for info in all_info_list:
    # A '/' in the title would be treated as a directory separator.
    safe_name = info[0].replace('/', '_')
    save_image_file(info[8], "doubanbookTop250/" + safe_name + ".jpg")

通过 Python 爬虫爬取豆瓣图书 Top250

文章评论