Scraping Yousuu (优书网) book information with Python Selenium multiprocessing and writing to CSV in real time

August 5, 2020
Multiprocessing

import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'


# Download a book cover image
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)


# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()


# Remove a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))


# Fetch a URL through a pooled proxy (helper for proxy-based requests; not called in the flow below)
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # Request through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # After 5 failed attempts, remove the proxy from the pool
    delete_proxy(proxy)
    return None


def save_to_csv(csvPath, lst):
    with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)


def get_info(url):
    print(url)
    # Optional ChromeOptions: custom user agent, proxy, and hiding the automation flag
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait until the book list has rendered
    try:
        WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        html = browser.page_source
    finally:
        # quit() also shuts down the chromedriver process
        browser.quit()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    data_list = []
    # Collect each book's fields into a list
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL, extracted from the src attribute of the <img> tag
        bookPicL = str(each.find("img"))
        startL = bookPicL.find('src="') + len('src="')
        endL = startL + bookPicL[startL:].find('"')
        bookPicUrl = bookPicL[startL:endL]
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags: strip the "本书标签:" label and all whitespace
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "").replace(" ", "")
                     .replace("\r", "").replace('\n', ''))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optional throttle between pages
    # time.sleep(1)
    return data_list


def run(url):
    save_to_csv(csvPath, get_info(url))


# Main entry point
if __name__ == '__main__':
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    # Note: multiprocessing.dummy.Pool is the thread-pool counterpart of multiprocessing.Pool
    pool = Pool(4)
    # A full crawl would cover every listing page:
    # pool.map(run, ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(1, 11335)])
    pool.map(run, [
        'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i))
        for i in range(1, 501)])
    pool.close()
    pool.join()
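
The save_image_file() helper is defined in both scripts but never called. Below is a minimal sketch of wiring it into the multiprocessing flow so covers are downloaded alongside the CSV rows; the coverDir path, the run_with_covers name, and the file-naming scheme are assumptions rather than part of the original code.

import os

coverDir = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/covers'


def run_with_covers(url):
    # Scrape one page, append its rows to the CSV, then download each cover
    data_list = get_info(url)
    save_to_csv(csvPath, data_list)
    os.makedirs(coverDir, exist_ok=True)
    for bookName, author, bookLabel, wordCount, bookPicUrl in data_list:
        if bookPicUrl.startswith('http'):
            # Drop characters Windows forbids in file names; use the book title as the name
            safeName = ''.join(c for c in bookName if c not in '\\/:*?"<>|')
            save_image_file(bookPicUrl, '{}/{}.jpg'.format(coverDir, safeName))

Passing run_with_covers instead of run to pool.map would keep the rest of the script unchanged.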


Multithreading

import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from threading import Thread
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'


# Download a book cover image
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)


# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()


# Remove a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))


# Fetch a URL through a pooled proxy (helper for proxy-based requests; not called in the flow below)
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # Request through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # After 5 failed attempts, remove the proxy from the pool
    delete_proxy(proxy)
    return None


def save_to_csv(csvPath, lst):
    with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)


def get_info(url):
    print(url)
    # Optional ChromeOptions: custom user agent, proxy, and hiding the automation flag
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait until the book list has rendered
    try:
        WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        html = browser.page_source
    finally:
        # quit() also shuts down the chromedriver process
        browser.quit()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    data_list = []
    # Collect each book's fields into a list
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL, extracted from the src attribute of the <img> tag
        bookPicL = str(each.find("img"))
        startL = bookPicL.find('src="') + len('src="')
        endL = startL + bookPicL[startL:].find('"')
        bookPicUrl = bookPicL[startL:endL]
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags: strip the "本书标签:" label and all whitespace
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "").replace(" ", "")
                     .replace("\r", "").replace('\n', ''))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optional throttle between pages
    # time.sleep(1)
    return data_list


def run(urls):
    for url in urls:
        save_to_csv(csvPath, get_info(url))



# Main entry point
if __name__ == '__main__':
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    # Give each thread its own page range (example partition; adjust the ranges as needed)
    urls1 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(1, 3)]
    urls2 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(3, 5)]
    urls3 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(5, 7)]
    urls4 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(7, 9)]
    # Start 4 threads, each with its own list of pages to crawl.
    # Note: pass the function and its arguments separately; target=run(urls1)
    # would call run() immediately in the main thread and give Thread nothing to do.
    thread_list = []
    t1 = Thread(target=run, args=(urls1,))
    t1.start()
    t2 = Thread(target=run, args=(urls2,))
    t2.start()
    t3 = Thread(target=run, args=(urls3,))
    t3.start()
    t4 = Thread(target=run, args=(urls4,))
    t4.start()
    thread_list.append(t1)
    thread_list.append(t2)
    thread_list.append(t3)
    thread_list.append(t4)
    for t in thread_list:
        t.join()
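
Both variants have several workers appending to the same CSV at once, so rows from different workers can interleave depending on buffering. Below is a minimal sketch of serializing the appends with a threading.Lock; the csv_lock and save_to_csv_locked names are assumptions, not part of the original code.

from threading import Lock

csv_lock = Lock()  # shared by every worker in this process


def save_to_csv_locked(csvPath, lst):
    # Same as save_to_csv(), but only one worker may append to the file at a time
    with csv_lock:
        with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, dialect='excel')
            for info_list in lst:
                writer.writerow(info_list)

Because multiprocessing.dummy.Pool and Thread both run their workers inside a single process, one in-process lock is enough for either script.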


Random User-Agent & Proxy Pool



# -*- coding: utf-8 -*-

import random
import time
import requests

proxy_list = []


def get_proxy_list():
    global proxy_list
    print("Loading proxy_list...")
    # ip.txt can be produced with a proxy-scraping crawler
    # (see the article linked above, "multithreaded crawler -- scraping proxy IPs")
    f = open("ip.txt")
    # Each line read from the file ends with a newline, so strip the '\n'
    line = f.readline().strip('\n')
    while line:
        proxy_list.append(line)
        line = f.readline().strip('\n')
    f.close()


def start():
    # Total attempts and successful attempts
    times = 0
    finished_times = 0
    # Loop forever
    while 1:
        # Plain strings (the original wrapped each entry in a set,
        # which requests cannot use as a header value)
        user_agent_list = [
            'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
            'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
            'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
            'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
            'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36']

        referer_list = [
            'http://blog.csdn.net/dala_da/article/details/79401163',
            'http://blog.csdn.net/',
            'https://www.sogou.com/tx?query=%E4%BD%BF%E7%94%A8%E7%88%AC%E8%99%AB%E5%88%B7csdn%E8%AE%BF%E9%97%AE%E9%87%8F&hdq=sogou-site-706608cfdbcc1886-0001&ekv=2&ie=utf8&cid=qb7.zhuye&',
            'https://www.baidu.com/s?tn=98074231_1_hao_pg&word=%E4%BD%BF%E7%94%A8%E7%88%AC%E8%99%AB%E5%88%B7csdn%E8%AE%BF%E9%97%AE%E9%87%8F']
        # URL of the blog post to hit
        url = 'http://blog.csdn.net/dala_da/article/details/79401163'
        # Pick a random User-Agent and Referer
        header = {'User-Agent': random.choice(user_agent_list),
                  'Referer': random.choice(referer_list)
                  }
        # Take proxies from proxy_list in turn
        ip = proxy_list[times % len(proxy_list)]
        # Build the proxies dict in the format requests expects
        proxy_ip = 'http://' + ip
        proxy_ips = 'https://' + ip
        proxy = {'https': proxy_ips, 'http': proxy_ip}

        try:
            response = requests.get(url, headers=header, proxies=proxy)
        except:
            # No response: report the failing proxy
            print('Proxy failed: ' + proxy["https"])
            time.sleep(0.1)
        else:
            print('Completed %d requests, %s' % (finished_times + 1, proxy["https"]))
            time.sleep(random.random())
            finished_times += 1

        times += 1
        # After every proxy has been used once, pause for 15 seconds
        if not times % len(proxy_list):
            time.sleep(15)


if __name__ == "__main__":
    get_proxy_list()
    start()
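
The commented-out ChromeOptions lines in get_info() show where a random User-Agent and proxy could also be fed into Selenium itself. Below is a minimal sketch combining them with the lists above; make_browser and its parameters are assumptions, not part of the original code.

import random
from selenium import webdriver


def make_browser(user_agent_list, proxy_list):
    # Build a Chrome instance with a random User-Agent and, if available, a random proxy,
    # mirroring the commented-out options in get_info()
    options = webdriver.ChromeOptions()
    options.add_argument('--user-agent=%s' % random.choice(user_agent_list))
    if proxy_list:
        options.add_argument('--proxy-server=http://%s' % random.choice(proxy_list))
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    return webdriver.Chrome(options=options)

Replacing browser = webdriver.Chrome() in get_info() with a call to make_browser() would rotate the identity on each page load.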


Tags: Python, web scraping
Last updated: August 5, 2020

kamisamak

