import random
from bs4 import BeautifulSoup
import urllib.request
import http.cookiejar
import re
import operator
from functools import reduce
import csv
import requests
webUrl = 'http://www.bookschina.com'
csvPath = 'C:/tool/dev/python/PycharmProjects/day0805_work01/bookschina/data/bookchina.csv'
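# Pool of desktop User-Agent strings; one is picked at random for each
# book-detail request to vary the request fingerprint.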
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Opera/8.0 (Windows NT 5.1; U; en)",
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
]
# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()

# Remove a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))
def getCPageUrl(cUrl, num):
    # Build the URL of page `num` of a category's listing
    curl = webUrl + cUrl.rstrip('/') + '_0_0_11_0_1_' + str(num) + '_0_0'
    return curl
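# Example (hypothetical category path): getCPageUrl('/kinder/31000000/', 2)
# returns 'http://www.bookschina.com/kinder/31000000_0_0_11_0_1_2_0_0'.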
def save_to_csv(csvPath, lst):
    with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)
class _ThirdPageLinkToGetData():
    def getInfo(self, cUrlN, bookNUrl):
        data_list = []
        headers = {'User-Agent': random.choice(user_agent_list)}
        # Route each detail-page request through a proxy from the proxy pool
        httpproxy_handler = urllib.request.ProxyHandler({'http': get_proxy().get("proxy")})
        bookOpener = urllib.request.build_opener(httpproxy_handler)
        bookOpener.addheaders = list(headers.items())
        for name in bookNUrl:
            url = webUrl + bookNUrl[name]
            print(url)
            html = bookOpener.open(url).read().decode("gb18030")
            soup = BeautifulSoup(html, "html.parser")
            # Cover image
            bookPic = soup.find('img', class_='jqzoom')['src']
            # Title
            bookName = soup.find('div', class_='padLeft10').h1.string
            # Author
            author = soup.find('div', class_='author').a.string
            # Publisher
            publisher = soup.find('div', class_='publisher').a.string.replace(' ', '')
            # Publication date
            publisherTime = soup.find('div', class_='publisher').i.string
            # Series
            try:
                SeriesBook = soup.find('div', class_='series').a.string.replace(' ', '')
            except AttributeError:
                SeriesBook = '暂无'  # "not available"
            # Format (trim size)
            try:
                bookSize = soup.find('div', class_='otherInfor').em.string.replace(' ', '')
            except AttributeError:
                bookSize = '暂无'
            # Page count
            try:
                bookPage = soup.find('div', class_='otherInfor').i.string.replace(' ', '')
            except AttributeError:
                bookPage = '暂无'
            # Price
            bookPrice = soup.find('del', class_='price').string.replace(' ', '')
            infos = soup.find('div', id='copyrightInfor').ul
            # ISBN
            bookISBN = infos.select('li')[0].string.split(':')[1].replace(' ', '')
            # Barcode
            barCode = infos.select('li')[1].string.split(':')[1].replace(' ', '')
            # Binding
            binding = infos.select('li')[2].string.split(':')[1].replace(' ', '')
            # Edition
            edition = infos.select('li')[3].string.split(':')[1].replace(' ', '')
            # Number of volumes
            volumes = infos.select('li')[4].string.split(':')[1].replace(' ', '')
            # Weight
            weight = infos.select('li')[5].string.split(':')[1].replace(' ', '')
            # Printing count
            printingVolume = infos.select('li')[6].string.split(':')[1].replace(' ', '')
            # Category breadcrumb, joined with '&'
            classifyAs = infos.find('li', class_='kind').find_all('a')
            classify = '&'.join(classA.string for classA in classifyAs)
            info_list = [bookPic, bookName, author, publisher, publisherTime, SeriesBook, bookSize,
                         bookPage, bookPrice, bookISBN, barCode, binding, edition, volumes, weight,
                         printingVolume, classify, cUrlN]
            data_list.append(info_list)
        save_to_csv(csvPath, data_list)
class _SecondPageLinkToGetUrl():
    def __init__(self, opener):
        self.opener = opener

    def getPage(self, url):
        # Parse the pagination bar and return the largest page number
        html = self.opener.open(url).read().decode("gb18030")
        soup = BeautifulSoup(html, "html.parser")
        regex = re.compile("^[0-9]*$")
        valueNumF = soup.find('div', class_='paging').find_all('a')
        numList = []
        for value in valueNumF:
            numList.append(regex.findall(value.next_element))
        # Flatten the per-link match lists, convert to ints, take the maximum
        numList = list(map(int, reduce(operator.add, numList)))
        return max(numList)

    def getBookUrl(self, url):
        # Map each book title on a listing page to its detail-page URL
        html = self.opener.open(url).read().decode("gb18030")
        soup = BeautifulSoup(html, "html.parser")
        bookName = soup.find('div', class_='bookList').find_all('h2', class_="name")
        dictNameUrl = {}
        for bHN2 in bookName:
            dictNameUrl[bHN2.string] = bHN2.a['href']
        return dictNameUrl
class _FirstPageLinkToGetUrl():
    def __init__(self, opener, url):
        self.opener = opener
        self.url = url

    def getDifferentSeriesBookUrl(self):
        html = self.opener.open(self.url).read().decode("gb18030")
        soup = BeautifulSoup(html, "html.parser")
        # Map each sub-category name to its URL
        dictUrl = {}
        for _li in soup.find('div', class_='categoriesList').find('div', class_='w1200 clearfix').find_all('li'):
            _a = _li.find('a')
            dictUrl[_a.string] = _a['href']
        return dictUrl
with open(csvPath, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, dialect='excel')
    writer.writerow(
        ['bookPic', 'bookName', 'author', 'publisher', 'publisherTime', 'SeriesBook', 'bookSize',
         'bookPage', 'bookPrice', 'bookISBN', 'barCode', 'binding', 'edition', 'volumes', 'weight',
         'printingVolume', 'classify', 'bClass'])
# Entry URL: the page listing all product categories on the site
url = 'http://www.bookschina.com/books/kinder/'
# Create a CookieJar instance
Cookie = http.cookiejar.CookieJar()
# Create the cookie-processing handler
CookieHandle = urllib.request.HTTPCookieProcessor(Cookie)
# Build an opener that carries cookies across requests
opener = urllib.request.build_opener(CookieHandle)
# Present a browser User-Agent so requests look like normal browser traffic
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.52"
}
opener.addheaders = list(header.items())
# Open the page once so the opener acquires the site's cookies
opener.open(url)
_fpl = _FirstPageLinkToGetUrl(opener, url)
# Get every category name and its corresponding URL
dictUrl = _fpl.getDifferentSeriesBookUrl()
_spl = _SecondPageLinkToGetUrl(opener)
_tpl = _ThirdPageLinkToGetData()
for cUrlN in dictUrl:
    # Determine how many listing pages the category has, then scrape each one
    cNum = _spl.getPage(getCPageUrl(dictUrl[cUrlN], 1))
    for num in range(1, cNum + 1):
        bookNUrl = _spl.getBookUrl(getCPageUrl(dictUrl[cUrlN], num))
        _tpl.getInfo(cUrlN, bookNUrl)
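# NOTE: delete_proxy is defined above but never called. Below is a minimal,
# hedged sketch of how it could be wired in, assuming the pool returns JSON
# like {"proxy": "ip:port"} and that any fetch error means the proxy should
# be retired. The helper name open_with_retry is hypothetical.
def open_with_retry(url, retries=3):
    # Try up to `retries` proxies, discarding each one that fails.
    for _ in range(retries):
        proxy = get_proxy().get("proxy")
        proxyOpener = urllib.request.build_opener(
            urllib.request.ProxyHandler({'http': proxy}))
        proxyOpener.addheaders = [('User-Agent', random.choice(user_agent_list))]
        try:
            return proxyOpener.open(url, timeout=10).read().decode("gb18030")
        except Exception:
            delete_proxy(proxy)  # retire the proxy that just failed
    raise RuntimeError('all proxy attempts failed for ' + url)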