python3: downloading images with BeautifulSoup and requests [beginner-friendly]


I've been learning Python web scraping for a week now, so I'm writing a blog post to sum up what I've learned.

My crawler uses the BeautifulSoup and requests libraries. They really are well suited to beginners: put in one focused day and you can start scraping things. The basic pattern is sketched below.
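Before the full script, here is a minimal sketch of the core pattern it is built on: fetch a page with requests, hand the HTML to BeautifulSoup, then search the parse tree. The User-Agent value here is a trimmed placeholder; the full script below uses a complete one.

import requests
from bs4 import BeautifulSoup

# Fetch the tag index page (same URL the script below targets).
r = requests.get('https://www.aitaotu.com/tag/aiss.html', headers={'User-Agent': 'Mozilla/5.0'})
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'lxml')

# Print the href of every <a> tag on the page.
for a in soup.find_all('a'):
	print(a.get('href'))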

The full code is as follows:

from multiprocessing import Pool  # imported with parallel downloads in mind, but unused below
from bs4 import BeautifulSoup
import requests
import os

url = 'https://www.aitaotu.com/tag/aiss.html'
headers = {
	'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
	'Referer': 'https://www.aitaotu.com/tag/aiss.html',
}

def get_pages(url):
	# Collect the link to each album listed on the tag index page.
	images_links = []
	r = requests.get(url=url, headers=headers, verify=True)
	r.encoding = 'utf-8'
	# Each album thumbnail is an <a class="Pli-litpic"> inside <ul id="mainbodypul">.
	bf = BeautifulSoup(r.text, 'lxml').find('ul', id='mainbodypul').find_all('a', class_='Pli-litpic')
	for each in bf:
		base_url = 'https://www.aitaotu.com' + each['href']
		images_links.append(base_url)
	return images_links
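# For illustration: get_pages(url) returns a list of absolute album URLs,
# e.g. ['https://www.aitaotu.com/guonei/12345.html', ...]  (hypothetical path)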

def get_images(images_links):
	for link in images_links:
		headers['Referer'] = link
		r = requests.get(url=link, headers=headers, verify=True)
		bf = BeautifulSoup(r.text, 'lxml')
		# Read the album's page count and title from its first page.
		totalpage = bf.find('span', id='picnum').find(class_='totalpage').get_text()
		title = bf.find('h2').get_text()
		path = title.strip()
		file_path = save_images(path)
		# Jump straight to the album's last page, e.g. xxx.html -> xxx_12.html.
		img_url = link[:-5] + '_%s.html' % totalpage
		headers['Referer'] = img_url
		response = requests.get(url=img_url, headers=headers, verify=True)
		soup = BeautifulSoup(response.text, 'lxml').find('p', align='center').find_all('a')
		for each in soup:
			src = each.img['src']  # after the loop, src is the last image on the page
		# File names are numbered like .../07.jpg, so the two digits before
		# '.jpg' give the total image count.
		num = src[-6:-4]
		base_link = src[:-6]
		try:
			num = int(num)
		except ValueError:
			# The last page sometimes carries an oddly named image; fall back
			# to the second-to-last page to read the numbering.
			img_url = link[:-5] + '_%d.html' % (int(totalpage) - 1)
			headers['Referer'] = img_url
			response = requests.get(url=img_url, headers=headers, verify=True)
			soup = BeautifulSoup(response.text, 'lxml').find('p', align='center').find_all('a')
			for each in soup:
				src = each.img['src']
			num = int(src[-6:-4])
		yield (base_link, num, file_path)

def save_images(path):
	# Create the album folder if it doesn't exist yet, then chdir into it.
	base_dir = '/Users/zhengqiteng/Desktop/zqt/aiss/'
	file_path = os.path.join(base_dir, path)
	if not os.path.exists(file_path):
		print('Created a folder named', path)
		os.makedirs(file_path)
	os.chdir(file_path)
	return file_path

def thread_download(base_link, num, file_path):
	# Download images 01.jpg .. NN.jpg for one album.
	for i in range(1, int(num) + 1):
		need2down = base_link + '%02d.jpg' % i  # remote names are zero-padded to two digits
		res = requests.get(url=need2down)
		print('Fetched image successfully')
		with open(file_path + '/%d.jpg' % i, 'wb') as f:
			f.write(res.content)
		print(file_path, 'image %d downloaded' % i)

if __name__ == '__main__':
	# Crawl every album linked from the tag page.
	for base_link, num, file_path in get_images(get_pages(url)):
		thread_download(base_link, num, file_path)
	# Single-album test call:
	# thread_download('https://img.aitaotu.cc:8089/Pics/2016/1106/01/', 18, '/Users/zhengqiteng/Desktop/zqt/aiss/[AISS爱丝] 经典丝袜美腿外拍 第145期 丽莎de新玩具')
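The script imports multiprocessing.Pool but never uses it, so albums download one after another. A minimal sketch of how the main block could farm albums out to a worker pool instead, assuming the functions above; download_album is a hypothetical helper and the pool size of 4 is arbitrary:

def download_album(args):
	# Unpack one (base_link, num, file_path) tuple produced by get_images.
	base_link, num, file_path = args
	thread_download(base_link, num, file_path)

if __name__ == '__main__':
	# Materialize the generator first so all album metadata is fetched in the parent.
	albums = list(get_images(get_pages(url)))
	with Pool(4) as pool:
		pool.map(download_album, albums)

Worker processes sidestep the GIL, though for I/O-bound downloads like these a thread pool would work just as well.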
		