When we scrape Bing images with plain Python, what comes back are only thumbnails, which are too small for everyday algorithm development. So I wrote a Selenium script that imitates how we open a browser by hand: you can type in your own keywords, and the image links can be saved to a txt file to guard against errors while downloading. The main code is as follows:
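For reference, the script depends on selenium, requests, beautifulsoup4 (with the lxml parser), and pypinyin, all available via pip (pip install selenium requests beautifulsoup4 lxml pypinyin), plus a ChromeDriver that matches your local Chrome version.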
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # Keys is needed to send ENTER
import time
import os
import requests
from bs4 import BeautifulSoup
import re
import functools
from threading import Thread
import pypinyin
from datetime import datetime
def timeout(timeout):
    """Decorator: run the wrapped function in a daemon thread and give up
    after `timeout` seconds, so one stalled download cannot hang the crawl."""
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Pre-fill the result with a timeout error; the worker thread
            # overwrites it if the call finishes in time.
            res = [Exception('function [%s] timeout [%s seconds] exceeded!' % (func.__name__, timeout))]
            def newFunc():
                try:
                    res[0] = func(*args, **kwargs)
                except Exception as e:
                    res[0] = e
            t = Thread(target=newFunc)
            t.daemon = True  # daemon thread: do not keep the process alive after timeout
            try:
                t.start()
                t.join(timeout)  # wait at most `timeout` seconds
            except Exception as je:
                print('error starting thread')
                raise je
            ret = res[0]
            if isinstance(ret, BaseException):
                raise ret
            return ret
        return wrapper
    return deco
@timeout(5)
def save_img(count, img_src, picpath, name):
    """Download one image into picpath; skip it if the file already exists."""
    dt = datetime.now().strftime("%Y_%m_%d")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'
    }
    filename = os.path.join(picpath, 'bing_{}_{}_{}.jpg'.format(name, dt, count))
    if os.path.exists(filename):
        return
    r = requests.get(img_src, headers=headers, timeout=(3, 7))  # (connect, read) timeouts
    r.raise_for_status()  # fail early on HTTP errors instead of saving an error page
    with open(filename, 'wb') as f:  # the with-block closes the file; no f.close() needed
        f.write(r.content)
    print(filename, 'saved..')
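# Hypothetical usage of the helper above: save_img(0, 'https://example.com/a.jpg', picpath, 'test')
# would fetch a single image; the @timeout(5) wrapper raises if the whole call takes
# longer than 5 seconds, on top of requests' own (3, 7) connect/read timeouts.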
url = 'https://cn.bing.com/images'
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
picpath = r'D:\AI项目\自然场景OCR'
os.makedirs(picpath, exist_ok=True)
# main loop: one fresh browser session per keyword
keys = ['粤康码', '穗康码', '新疆健康码']
for key in keys:
    count = 0
    # build an ASCII filename stem from the keyword's pinyin
    name = ''.join(n[0] for n in pypinyin.pinyin(key, style=pypinyin.NORMAL))
    cur_picpath = os.path.join(picpath, key)
    os.makedirs(cur_picpath, exist_ok=True)
    print(f'processing {key} ....')
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    browser.maximize_window()
    search_box = browser.find_element(By.ID, "sb_form_q")
    search_box.clear()
    search_box.send_keys(key, Keys.ENTER)
    time.sleep(2)
    browser.find_element(By.CLASS_NAME, "fltIdtTit").click()  # open the filter bar
    time.sleep(1)
    browser.find_element(By.XPATH, '/html/body/div[3]/div[2]/div/ul/li[1]/span/span').click()  # open the "Image size" menu
    time.sleep(1)
    browser.find_element(By.XPATH, '/html/body/div[3]/div[2]/div/ul/li[1]/div/div/a[3]/span').click()  # pick "Medium"
    time.sleep(1)
    # keep scrolling until the page height stops growing, i.e. no more results load
    scrolling = True
    while scrolling:
        check_height = browser.execute_script("return document.body.scrollHeight;")
        for _ in range(20):  # number of scroll steps per round
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(1)
        new_height = browser.execute_script("return document.body.scrollHeight;")
        print(check_height, new_height)
        if check_height == new_height:
            scrolling = False
    allhtml = browser.page_source  # grab the fully rendered HTML
    browser.quit()  # close the browser once we have the source
    bs = BeautifulSoup(allhtml, 'lxml')
    list01 = bs.find_all('a', class_='iusc')  # every result anchor with class='iusc'
    # each anchor's 'm' attribute holds JSON whose 'murl' field is the original image URL
    src_list = re.findall(r'"murl":"(.*?)"', str(list01))
    print(len(src_list))
    for img_url in src_list:
        try:
            save_img(count, img_url, cur_picpath, name)
            count += 1
        except Exception:
            continue  # skip timeouts and broken links, keep going
This approach imitates opening a browser by hand: it types the keyword into the search box, scrolls the page all the way to the bottom so every result is loaded, then parses the page's HTML source to pull out the image links and download them. It has seen wide use in our real projects.
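As a follow-up, the link-to-txt step mentioned at the top is easy to bolt on. Below is a minimal sketch, assuming the src_list and cur_picpath variables from the loop above; the file name links.txt and both helper names are my own choices for illustration, not part of the original script:

def dump_links(src_list, cur_picpath):
    # one URL per line, so a failed run can later be resumed from the file
    txt_path = os.path.join(cur_picpath, 'links.txt')
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(src_list))
    return txt_path

def load_links(txt_path):
    # read the saved URLs back for (re-)downloading
    with open(txt_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

Writing the links out before downloading decouples the slow Selenium stage from the flaky download stage: if a download dies halfway, you rerun only load_links plus save_img instead of re-scraping the whole page.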