Multiprocessing
import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
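# note: multiprocessing.dummy exposes the multiprocessing Pool API but is backed by threads, not real processes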
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'
# Download a book cover image
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)
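# save_image_file is not called in the crawl below; a hypothetical call, reusing a cover URL
# collected by get_info, would look like:
#     save_image_file(bookPicUrl, 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/cover.jpg')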
# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()

# Delete a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # request the page through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # failed 5 times: remove the proxy from the pool
    delete_proxy(proxy)
    return None
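# getHtml is not used by the crawl below (get_info drives its own browser); a minimal
# usage sketch, assuming the proxy-pool service above is reachable:
#     resp = getHtml('https://www.yousuu.com/bookstore/')
#     if resp is not None:
#         print(resp.status_code)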
def save_to_csv(csvPath, lst):
    # append mode, so rows from different workers accumulate in the same file
    with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)
def get_info(url):
    print(url)
    # Options for stealthier crawling (custom UA, proxy, hide the automation flag); disabled here
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait: block up to 20 s until the book list container has been rendered
    try:
        element = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        html = browser.page_source
    finally:
        # browser.quit()
        browser.close()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    data_list = []
    # Collect one row per book
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL
        img = each.find("img")
        bookPicUrl = img.get("src", "") if img else ""
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "").replace(" ", "").replace("\r", "").replace("\n", ""))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optional throttle between pages
    # time.sleep(1)
    return data_list
def run(url):
    save_to_csv(csvPath, get_info(url))
# Program entry point
if __name__ == '__main__':
    # write the CSV header once, then let the workers append rows
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    pool = Pool(4)
    # full crawl: pool.map(run, ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i) for i in range(1, 11335)])
    pool.map(run, [
        'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
        for i in range(1, 501)])
    pool.close()
    pool.join()
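Because the import above is multiprocessing.dummy, the Pool(4) here is really a pool of 4 threads behind the multiprocessing API. If you want real worker processes, a minimal sketch (assuming the same module-level run defined above, and accepting that concurrent appends to one CSV can interleave rows) only swaps the import:

from multiprocessing import Pool  # real processes instead of multiprocessing.dummy's threads

if __name__ == '__main__':
    urls = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
            for i in range(1, 501)]
    with Pool(4) as pool:
        # same map() call; run/get_info/save_to_csv must stay at module level so worker processes can import them
        pool.map(run, urls)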
Multithreading
import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from threading import Thread
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'
# Download a book cover image
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)
# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()

# Delete a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # request the page through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # failed 5 times: remove the proxy from the pool
    delete_proxy(proxy)
    return None
def save_to_csv(csvPath, lst):
    # append mode, so rows from different threads accumulate in the same file
    with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)
def get_info(url):
    print(url)
    # Options for stealthier crawling (custom UA, proxy, hide the automation flag); disabled here
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait: block up to 20 s until the book list container has been rendered
    try:
        element = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        html = browser.page_source
    finally:
        # browser.quit()
        browser.close()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    data_list = []
    # Collect one row per book
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL
        img = each.find("img")
        bookPicUrl = img.get("src", "") if img else ""
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "").replace(" ", "").replace("\r", "").replace("\n", ""))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optional throttle between pages
    # time.sleep(1)
    return data_list
def run(urls):
    for url in urls:
        save_to_csv(csvPath, get_info(url))
# Program entry point
if __name__ == '__main__':
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    # give each thread its own slice of pages so no page is scraped twice
    urls1 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i) for i in range(1, 3)]
    urls2 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i) for i in range(3, 5)]
    urls3 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i) for i in range(5, 7)]
    urls4 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i) for i in range(7, 9)]
    # Start 4 threads, each with its own URL list. target must be the function itself and the
    # URLs passed via args; Thread(target=run(urls1)) would call run() immediately in the main
    # thread instead of running it in a new thread.
    thread_list = []
    t1 = Thread(target=run, args=(urls1,))
    t1.start()
    t2 = Thread(target=run, args=(urls2,))
    t2.start()
    t3 = Thread(target=run, args=(urls3,))
    t3.start()
    t4 = Thread(target=run, args=(urls4,))
    t4.start()
    thread_list.append(t1)
    thread_list.append(t2)
    thread_list.append(t3)
    thread_list.append(t4)
    for t in thread_list:
        t.join()
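The four hand-rolled threads above can also be written with the standard library's thread pool; a minimal sketch, assuming the same get_info and save_to_csv defined earlier in this script:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    urls = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
            for i in range(1, 9)]
    with ThreadPoolExecutor(max_workers=4) as executor:
        # each URL is scraped and appended to the CSV by one of 4 worker threads
        list(executor.map(lambda u: save_to_csv(csvPath, get_info(u)), urls))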
Random UA & proxy pool
# -*- coding: utf-8 -*-
import urllib.request
import random
import time
import requests
proxy_list = []
def get_proxy_list():
    global proxy_list
    print("Loading proxy_list...")
    # the ip file can be built with the "multithreaded crawler - scraping proxy IPs" article linked above
    f = open("ip.txt")
    # lines read from the file end with a newline, so strip the '\n'
    line = f.readline().strip('\n')
    while line:
        proxy_list.append(line)
        line = f.readline().strip('\n')
    f.close()
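# ip.txt is assumed to hold one proxy per line in host:port form (it is concatenated
# with 'http://' / 'https://' below), for example:
#     127.0.0.1:8080
#     10.0.0.1:3128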
def start():
    # total attempts and successful attempts
    times = 0
    finished_times = 0
    # candidate User-Agent and Referer values to rotate through (plain strings,
    # not sets, or requests will reject the header values)
    user_agent_list = [
        'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
        'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
        'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
        'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
        'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36']
    referer_list = [
        'http://blog.csdn.net/dala_da/article/details/79401163',
        'http://blog.csdn.net/',
        'https://www.sogou.com/tx?query=%E4%BD%BF%E7%94%A8%E7%88%AC%E8%99%AB%E5%88%B7csdn%E8%AE%BF%E9%97%AE%E9%87%8F&hdq=sogou-site-706608cfdbcc1886-0001&ekv=2&ie=utf8&cid=qb7.zhuye&',
        'https://www.baidu.com/s?tn=98074231_1_hao_pg&word=%E4%BD%BF%E7%94%A8%E7%88%AC%E8%99%AB%E5%88%B7csdn%E8%AE%BF%E9%97%AE%E9%87%8F']
    # the blog URL whose view count we want to refresh
    url = 'http://blog.csdn.net/dala_da/article/details/79401163'
    # refresh indefinitely
    while 1:
        # random User-Agent and Referer
        header = {'User-Agent': random.choice(user_agent_list),
                  'Referer': random.choice(referer_list)}
        # take proxies from proxy_list in round-robin order
        ip = proxy_list[times % len(proxy_list)]
        # build the proxies dict in the format requests expects
        proxy_ip = 'http://' + ip
        proxy_ips = 'https://' + ip
        proxy = {'https': proxy_ips, 'http': proxy_ip}
        try:
            response = requests.get(url, headers=header, proxies=proxy)
        except Exception:
            # no response: report the broken proxy
            print('Proxy failed: ' + proxy["https"])
            time.sleep(0.1)
        else:
            print('Refreshed %d times, %s' % (finished_times + 1, proxy["https"]))
            time.sleep(random.random())
            finished_times += 1
        times += 1
        # after every full pass over the proxy list, pause for 15 seconds
        if not times % len(proxy_list):
            time.sleep(15)
if __name__ == "__main__":
    get_proxy_list()
    start()