Python 3: downloading image galleries with BeautifulSoup and requests [beginner-friendly]
I've been learning Python web scraping for about a week now, so I'm writing this post to sum things up.
The libraries I use are BeautifulSoup and requests. They really are beginner-friendly: spend one focused day on them and you can start scraping real pages.
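Before the full script, here is a minimal sketch of the basic fetch-and-parse pattern these two libraries give you. The URL and the plain 'a' selector are placeholders for illustration, not part of the script below:

import requests
from bs4 import BeautifulSoup

# Fetch a page, parse it, and pull out every link on it.
# 'https://example.com' is a placeholder URL.
resp = requests.get('https://example.com', timeout=10)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
for a in soup.find_all('a'):
    print(a.get('href'))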
The code is as follows:
from bs4 import BeautifulSoup
import requests
import os

url = 'https://www.aitaotu.com/tag/aiss.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
    'Referer': 'https://www.aitaotu.com/tag/aiss.html',
}

def get_pages(url):
    # Collect the detail-page link of every gallery on the list page.
    images_links = []
    r = requests.get(url=url, headers=headers, verify=True)
    r.encoding = 'utf-8'
    bf = BeautifulSoup(r.text, 'lxml').find('ul', id='mainbodypul').find_all('a', class_='Pli-litpic')
    for each in bf:
        base_url = 'https://www.aitaotu.com' + each['href']
        images_links.append(base_url)
    return images_links

def get_images(images_links):
    # For each gallery, work out the image URL prefix, the highest image
    # number, and the local folder, then yield them for the downloader.
    for link in images_links:
        headers['Referer'] = link  # the site checks Referer to block hotlinking
        r = requests.get(url=link, headers=headers, verify=True)
        bf = BeautifulSoup(r.text, 'lxml')
        totalpage = bf.find('span', id='picnum').find(class_='totalpage').get_text()
        title = bf.find('h2').get_text()
        file_path = save_images(title.strip())
        # Jump straight to the last page of the gallery:
        # .../12345.html -> .../12345_<totalpage>.html
        img_url = link[:-5] + '_%s.html' % totalpage
        headers['Referer'] = img_url
        response = requests.get(url=img_url, headers=headers, verify=True)
        soup = BeautifulSoup(response.text, 'lxml').find('p', align='center').find_all('a')
        for each in soup:
            src = each.img['src']
            num = src[-6:-4]       # the two digits before '.jpg' give the image number
            base_link = src[:-6]   # URL prefix shared by all images in the gallery
            try:
                num = int(num)
            except ValueError:
                # The last page carried no usable number; fall back to the
                # second-to-last page and read the number from there.
                img_url = link[:-5] + '_%d.html' % (int(totalpage) - 1)
                headers['Referer'] = img_url
                response = requests.get(url=img_url, headers=headers, verify=True)
                fallback = BeautifulSoup(response.text, 'lxml').find('p', align='center').find_all('a')
                for a in fallback:
                    src = a.img['src']
                    num = int(src[-6:-4])
        yield (base_link, num, file_path)

def save_images(path):
    # Create the gallery folder if it does not exist yet and return its path.
    file_path = os.path.join('/Users/zhengqiteng/Desktop/zqt/aiss/', path)
    if not os.path.exists(file_path):
        print('Created a folder named', path)
        os.makedirs(file_path)
    os.chdir(file_path)
    return file_path

def thread_download(base_link, num, file_path):
    # Download images 1..num; file names on the server are zero-padded
    # to two digits (01.jpg, 02.jpg, ...).
    for i in range(1, int(num) + 1):
        need2down = base_link + '%02d.jpg' % i
        res = requests.get(url=need2down)
        print('Image fetched successfully')
        with open(file_path + '/%d.jpg' % i, 'wb') as f:
            f.write(res.content)
        print(file_path, 'image %d downloaded' % i)

if __name__ == '__main__':
    # print(get_pages(url))
    # a = get_pages(url)
    # for each in get_images(a):
    thread_download('https://img.aitaotu.cc:8089/Pics/2016/1106/01/', 18,
                    '/Users/zhengqiteng/Desktop/zqt/aiss/[AISS爱丝] 经典丝袜美腿外拍 第145期 丽莎de新玩具')
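The __main__ block above only runs a hard-coded test call; the commented-out lines hint at the intended end-to-end flow. A sketch of that wiring, based on my reading of those comments rather than the author's tested code, would be:

if __name__ == '__main__':
    links = get_pages(url)                          # gallery pages from the list page
    for base_link, num, file_path in get_images(links):
        thread_download(base_link, num, file_path)  # sequential download

The original post also imported multiprocessing.Pool without using it; one way to put it to work (an assumption on my part, with the worker count chosen arbitrarily) is to collect the jobs first and fan them out:

from multiprocessing import Pool

if __name__ == '__main__':
    links = get_pages(url)
    jobs = list(get_images(links))   # [(base_link, num, file_path), ...]
    with Pool(4) as pool:            # 4 worker processes; tune to taste
        pool.starmap(thread_download, jobs)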