Scraping 优书网 (yousuu.com) Book Data with Python Selenium and Multiprocessing, Writing to CSV in Real Time

Posted by kamisamak on 2020-08-05 · 1914 views


[infobox title="Multiprocessing"]

import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool  # multiprocessing.dummy.Pool is a thread pool; use real processes here
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'


# Download a book cover image
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)
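

# Optional usage sketch (not part of the original script): save_image_file is
# defined but never called. Rows shaped like the ones written to the CSV could
# be used to fetch the covers; the output directory name is only illustrative.
def download_covers(rows, out_dir='covers'):
    import os
    os.makedirs(out_dir, exist_ok=True)
    for bookName, author, bookLabel, wordCount, bookPicUrl in rows:
        save_image_file(bookPicUrl, os.path.join(out_dir, bookName + '.jpg'))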


# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()


# Remove a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))


# Plain-requests fetch through the proxy pool (not called by the Selenium flow below)
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # Fetch the page through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # Failed 5 times: remove the proxy from the pool
    delete_proxy(proxy)
    return None


def save_to_csv(csvPath, lst):
    with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)


def get_info(url):
    print(url)
    # Optional ChromeOptions: custom User-Agent, proxy, and hiding the automation flag
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait until the book list has rendered
    try:
        WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        html = browser.page_source
    finally:
        # quit() also shuts down the chromedriver process; close() would leave it running
        browser.quit()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    data_list = []
    # Collect each book's fields into a row
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL
        img = each.find("img")
        bookPicUrl = img.get("src") if img else ""
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags ("本书标签:" is the label text on the page)
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "").replace(" ", "")
                     .replace("\r", "").replace("\n", ""))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optional throttle between pages
    # time.sleep(1)
    return data_list


def run(url):
    save_to_csv(csvPath, get_info(url))
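

# Optional sketch (not part of the original script): each worker launches a full
# Chrome window, which gets expensive with several running in parallel. Running
# Chrome headless reduces the cost; the flags below assume a reasonably recent
# Chrome/Selenium, so check them against your local versions.
def make_headless_browser():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')      # no visible browser window
    options.add_argument('--disable-gpu')   # commonly added on Windows
    return webdriver.Chrome(options=options)
# get_info() could call make_headless_browser() instead of webdriver.Chrome().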


# Program entry point
if __name__ == '__main__':
    # Write the CSV header once; workers then append rows
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    pool = Pool(4)
    # A full-site crawl would be range(1, 11335); here only the first 500 pages
    pool.map(run, [
        'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
        for i in range(1, 501)])
    pool.close()
    pool.join()

[/infobox]
[infobox title="Multithreading"]

import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from threading import Thread, Lock
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'
# Serialize CSV appends from multiple threads
csv_lock = Lock()


# Download a book cover image
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)


# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()


# Remove a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))


# Plain-requests fetch through the proxy pool (not called by the Selenium flow below)
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # Fetch the page through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # Failed 5 times: remove the proxy from the pool
    delete_proxy(proxy)
    return None


def save_to_csv(csvPath, lst):
    # Hold the lock so rows from different threads don't interleave in the file
    with csv_lock, open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)


def get_info(url):
    print(url)
    # Optional ChromeOptions: custom User-Agent, proxy, and hiding the automation flag
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait until the book list has rendered
    try:
        WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        html = browser.page_source
    finally:
        # quit() also shuts down the chromedriver process; close() would leave it running
        browser.quit()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    data_list = []
    # Collect each book's fields into a row
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL
        img = each.find("img")
        bookPicUrl = img.get("src") if img else ""
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags ("本书标签:" is the label text on the page)
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "").replace(" ", "")
                     .replace("\r", "").replace("\n", ""))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optional throttle between pages
    # time.sleep(1)
    return data_list


def run(urls):
    for url in urls:
        save_to_csv(csvPath, get_info(url))
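

# Optional sketch (not part of the original script): split a page range across N
# worker threads instead of hand-writing urls1..urls4. The thread count and page
# numbers are only illustrative.
def split_pages(first_page, last_page, num_threads):
    base = 'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'
    pages = list(range(first_page, last_page + 1))
    # Round-robin assignment: thread k gets pages k, k + num_threads, k + 2*num_threads, ...
    return [[base.format(p) for p in pages[k::num_threads]] for k in range(num_threads)]
# Example: threads = [Thread(target=run, args=(chunk,)) for chunk in split_pages(1, 8, 4)]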



# Program entry point
if __name__ == '__main__':
    # Write the CSV header once; worker threads then append rows
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    # Give each thread its own, non-overlapping page range
    base = 'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'
    urls1 = [base.format(i) for i in range(1, 3)]
    urls2 = [base.format(i) for i in range(3, 5)]
    urls3 = [base.format(i) for i in range(5, 7)]
    urls4 = [base.format(i) for i in range(7, 9)]
    # Start 4 threads, each crawling its own list of pages; pass the function and
    # its arguments separately (calling run(urls) here would execute it
    # synchronously in the main thread instead of in a new thread)
    thread_list = []
    for urls in (urls1, urls2, urls3, urls4):
        t = Thread(target=run, args=(urls,))
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()

[/infobox]
[infobox title="Random UA & Proxy Pool"]
[hide reply_to_this="true"]

# -*- coding: utf-8 -*-

import random
import time
import requests

proxy_list = []


def get_proxy_list():
    global proxy_list
    print("Loading proxy_list...")
    # ip.txt comes from the earlier post "多线程爬虫——抓取代理ip" (a multithreaded proxy-IP scraper)
    f = open("ip.txt")
    # Strip the trailing newline from each line read from the file
    line = f.readline().strip('\n')
    while line:
        proxy_list.append(line)
        line = f.readline().strip('\n')
    f.close()


def start():
    # Total attempts and successful attempts
    times = 0
    finished_times = 0
    user_agent_list = [
        'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
        'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
        'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
        'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36']

    referer_list = [
        'http://blog.csdn.net/dala_da/article/details/79401163',
        'http://blog.csdn.net/',
        'https://www.sogou.com/tx?query=%E4%BD%BF%E7%94%A8%E7%88%AC%E8%99%AB%E5%88%B7csdn%E8%AE%BF%E9%97%AE%E9%87%8F&hdq=sogou-site-706608cfdbcc1886-0001&ekv=2&ie=utf8&cid=qb7.zhuye&',
        'https://www.baidu.com/s?tn=98074231_1_hao_pg&word=%E4%BD%BF%E7%94%A8%E7%88%AC%E8%99%AB%E5%88%B7csdn%E8%AE%BF%E9%97%AE%E9%87%8F']
    # URL of the blog post whose view count is being boosted
    url = 'http://blog.csdn.net/dala_da/article/details/79401163'
    # Loop forever
    while True:
        # Pick a random User-Agent and Referer
        header = {'User-Agent': random.choice(user_agent_list),
                  'Referer': random.choice(referer_list)
                  }
        # Take proxies from proxy_list in round-robin order
        ip = proxy_list[times % len(proxy_list)]
        # Build the proxy dict in the format requests expects
        proxy_ip = 'http://' + ip
        proxy_ips = 'https://' + ip
        proxy = {'https': proxy_ips, 'http': proxy_ip}

        try:
            response = requests.get(url, headers=header, proxies=proxy)
        except Exception:
            # No response: report the failing proxy
            print('Proxy failed: ' + proxy["https"])
            time.sleep(0.1)
        else:
            print('Boosted %d times via %s' % (finished_times + 1, proxy["https"]))
            time.sleep(random.random())
            finished_times += 1

        times += 1
        # After a full pass over all the proxies, pause for 15 seconds
        if not times % len(proxy_list):
            time.sleep(15)
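

# Optional sketch (not part of the original script): drop dead proxies from
# proxy_list before looping. The test URL and timeout are only illustrative.
def check_proxies(candidates, test_url='http://httpbin.org/ip', timeout=5):
    alive = []
    for ip in candidates:
        proxy = {'http': 'http://' + ip, 'https': 'https://' + ip}
        try:
            if requests.get(test_url, proxies=proxy, timeout=timeout).status_code == 200:
                alive.append(ip)
        except Exception:
            # Unreachable or broken proxy: skip it
            pass
    return alive
# Example (in __main__, after get_proxy_list()): proxy_list[:] = check_proxies(proxy_list)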


if __name__ == "__main__":
    get_proxy_list()
    start()

[/hide]
[/infobox]