searchusermenu
  • 发布文章
  • 消息中心
点赞
收藏
评论
分享
原创

selenium爬取必应高清大图

2023-05-25 09:08:01
29
0

我们平常使用python爬取bing的图片时,返回的都是小图,无法用于我们平时算法开发,于是我开发了一个使用selenium模拟我们平时打开浏览器爬虫的脚本,可以自定义输入关键字,解析出高清大图的链接后直接下载保存,并为每次下载加上超时保护,防止下载图片时卡死,主要代码如下:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys #需要引入 keys 包
import time
import os
import requests
import urllib
from bs4 import BeautifulSoup
import re
import functools
from threading import Thread
import pypinyin
from datetime import  datetime

def timeout(timeout):
    """Decorator factory: run the wrapped function in a daemon thread and
    raise an Exception if it does not finish within *timeout* seconds.

    If the worker raises, that exception is re-raised in the caller.
    Note: the worker thread cannot be killed — on timeout it keeps running
    in the background and its eventual result is simply discarded.
    """
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Pre-seed the result slot with the timeout error; the worker
            # overwrites it only if it finishes in time.
            outcome = [Exception('function [%s] timeout [%s seconds] exceeded!' % (func.__name__, timeout))]

            def worker():
                try:
                    outcome[0] = func(*args, **kwargs)
                except Exception as exc:
                    # Capture the worker's exception so it can be re-raised
                    # on the calling thread.
                    outcome[0] = exc

            runner = Thread(target=worker)
            runner.daemon = True  # don't keep the process alive for a stuck download
            try:
                runner.start()
                runner.join(timeout)
            except Exception as thread_err:
                print('error starting thread')
                raise thread_err
            result = outcome[0]
            if isinstance(result, BaseException):
                raise result
            return result
        return wrapper
    return deco




@timeout(5)
def save_img(count, img_src, picpath, name):
    """Download one image from *img_src* into *picpath*.

    The file is named ``bing_{name}_{date}_{count}.jpg``; an already
    existing file is left untouched. Wrapped with @timeout(5) so a hung
    download is abandoned after 5 seconds.

    :param count: running index used in the file name
    :param img_src: direct URL of the full-size image (the "murl" field)
    :param picpath: destination directory (must already exist)
    :param name: pinyin of the search keyword, used in the file name
    :raises requests.HTTPError: on a non-2xx response
    """
    day = datetime.now().strftime("%Y_%m_%d")
    # Some image hosts refuse requests without a browser-like User-Agent.
    headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'
        }
    filename = os.path.join(picpath, 'bing_{}_{}_{}.jpg'.format(name, day, count))
    if os.path.exists(filename):
        return
    # (3, 7) = connect timeout 3s, read timeout 7s.
    r = requests.get(img_src, headers=headers, timeout=(3, 7))
    r.raise_for_status()  # fix: don't save an HTML error page as a .jpg
    # fix: the 'with' block closes the file; the old extra f.close() was redundant
    with open(filename, 'wb') as f:
        f.write(r.content)
    print(filename, 'saved..')


# --- Scraper driver: search Bing Images for each keyword, scroll to the
# bottom to force lazy loading of all results, then parse the full-size
# image URLs ("murl") out of the page source and download them. ---

url = 'https://cn.bing.com/images'
options = webdriver.ChromeOptions()
# Suppress the noisy "DevTools listening ..." console output on Windows.
options.add_experimental_option('excludeSwitches', ['enable-logging'])

picpath = r'D:\AI项目\自然场景OCR'
os.makedirs(picpath, exist_ok=True)

# Search keywords (health-code screenshots used as OCR training data).
keys = ['粤康码', '穗康码', '新疆健康码']
for key in keys:
    count = 0
    # Romanize the keyword so the saved file names stay ASCII.
    name = ''.join(p[0] for p in pypinyin.pinyin(key, style=pypinyin.NORMAL))
    cur_picpath = os.path.join(picpath, key)
    os.makedirs(cur_picpath, exist_ok=True)
    print(f'processing {key} 中....')

    # NOTE(review): find_element_by_* was removed in Selenium 4; this script
    # requires Selenium 3.x (or a switch to find_element(By.ID, ...)).
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    browser.maximize_window()
    browser.find_element_by_id("sb_form_q").clear()
    browser.find_element_by_id("sb_form_q").send_keys(key, Keys.ENTER)
    # implicitly_wait sets a global element-lookup timeout; it is not a sleep.
    browser.implicitly_wait(2)

    browser.find_element_by_class_name("fltIdtTit").click()  # open the "Filter" menu
    browser.implicitly_wait(1)

    # Absolute XPaths are brittle; they match Bing's filter layout at the
    # time of writing and will break if the page structure changes.
    browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/ul/li[1]/span/span').click()  # "Image size" dropdown
    browser.implicitly_wait(1)

    browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/ul/li[1]/div/div/a[3]/span').click()  # pick "Medium"
    browser.implicitly_wait(1)

    # Keep scrolling until the page height stops growing (= no more results).
    while True:
        height_before = browser.execute_script("return document.body.scrollHeight;")
        for _ in range(20):  # scroll in bursts of 20
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(1)  # give lazy-loaded thumbnails time to arrive
        height_after = browser.execute_script("return document.body.scrollHeight;")
        print(height_before, height_after)
        if height_before == height_after:
            break

    allhtml = browser.page_source  # grab the fully-expanded page
    browser.quit()  # done with the browser once we have the HTML

    soup = BeautifulSoup(allhtml, 'lxml')
    anchors = soup.find_all('a', class_='iusc')  # every result tile is an <a class="iusc">
    # The full-resolution URL sits in the JSON-ish "murl" attribute.
    src_list = re.findall(r'\"murl\"\:\"(.*?)\"\,\"', str(anchors))
    print(len(src_list))
    for img_url in src_list:
        try:
            save_img(count, img_url, cur_picpath, name)
            count += 1
        except Exception:
            # fix: the bare `except:` also swallowed KeyboardInterrupt;
            # skip only ordinary download failures and move on.
            continue

该方法通过模拟我们手动打开浏览器,并输入关键字进行搜索,再将页面拉到最底下,获取当前页面的HTML源代码进行分析,获取图片链接并下载。在我们实际项目中有着广泛的应用。

0条评论
0 / 1000
凌****昆
4文章数
0粉丝数
凌****昆
4 文章 | 0 粉丝
原创

selenium爬取必应高清大图

2023-05-25 09:08:01
29
0

我们平常使用python爬取bing的图片时,返回的都是小图,无法用于我们平时算法开发,于是我开发了一个使用selenium模拟我们平时打开浏览器爬虫的脚本,可以自定义输入关键字,解析出高清大图的链接后直接下载保存,并为每次下载加上超时保护,防止下载图片时卡死,主要代码如下:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys #需要引入 keys 包
import time
import os
import requests
import urllib
from bs4 import BeautifulSoup
import re
import functools
from threading import Thread
import pypinyin
from datetime import  datetime

def timeout(timeout):
    """Decorator factory: run the wrapped function in a daemon thread and
    raise an Exception if it does not finish within *timeout* seconds.

    If the worker raises, that exception is re-raised in the caller.
    Note: the worker thread cannot be killed — on timeout it keeps running
    in the background and its eventual result is simply discarded.
    """
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Pre-seed the result slot with the timeout error; the worker
            # overwrites it only if it finishes in time.
            outcome = [Exception('function [%s] timeout [%s seconds] exceeded!' % (func.__name__, timeout))]

            def worker():
                try:
                    outcome[0] = func(*args, **kwargs)
                except Exception as exc:
                    # Capture the worker's exception so it can be re-raised
                    # on the calling thread.
                    outcome[0] = exc

            runner = Thread(target=worker)
            runner.daemon = True  # don't keep the process alive for a stuck download
            try:
                runner.start()
                runner.join(timeout)
            except Exception as thread_err:
                print('error starting thread')
                raise thread_err
            result = outcome[0]
            if isinstance(result, BaseException):
                raise result
            return result
        return wrapper
    return deco




@timeout(5)
def save_img(count, img_src, picpath, name):
    """Download one image from *img_src* into *picpath*.

    The file is named ``bing_{name}_{date}_{count}.jpg``; an already
    existing file is left untouched. Wrapped with @timeout(5) so a hung
    download is abandoned after 5 seconds.

    :param count: running index used in the file name
    :param img_src: direct URL of the full-size image (the "murl" field)
    :param picpath: destination directory (must already exist)
    :param name: pinyin of the search keyword, used in the file name
    :raises requests.HTTPError: on a non-2xx response
    """
    day = datetime.now().strftime("%Y_%m_%d")
    # Some image hosts refuse requests without a browser-like User-Agent.
    headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Mobile Safari/537.36'
        }
    filename = os.path.join(picpath, 'bing_{}_{}_{}.jpg'.format(name, day, count))
    if os.path.exists(filename):
        return
    # (3, 7) = connect timeout 3s, read timeout 7s.
    r = requests.get(img_src, headers=headers, timeout=(3, 7))
    r.raise_for_status()  # fix: don't save an HTML error page as a .jpg
    # fix: the 'with' block closes the file; the old extra f.close() was redundant
    with open(filename, 'wb') as f:
        f.write(r.content)
    print(filename, 'saved..')


# --- Scraper driver: search Bing Images for each keyword, scroll to the
# bottom to force lazy loading of all results, then parse the full-size
# image URLs ("murl") out of the page source and download them. ---

url = 'https://cn.bing.com/images'
options = webdriver.ChromeOptions()
# Suppress the noisy "DevTools listening ..." console output on Windows.
options.add_experimental_option('excludeSwitches', ['enable-logging'])

picpath = r'D:\AI项目\自然场景OCR'
os.makedirs(picpath, exist_ok=True)

# Search keywords (health-code screenshots used as OCR training data).
keys = ['粤康码', '穗康码', '新疆健康码']
for key in keys:
    count = 0
    # Romanize the keyword so the saved file names stay ASCII.
    name = ''.join(p[0] for p in pypinyin.pinyin(key, style=pypinyin.NORMAL))
    cur_picpath = os.path.join(picpath, key)
    os.makedirs(cur_picpath, exist_ok=True)
    print(f'processing {key} 中....')

    # NOTE(review): find_element_by_* was removed in Selenium 4; this script
    # requires Selenium 3.x (or a switch to find_element(By.ID, ...)).
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    browser.maximize_window()
    browser.find_element_by_id("sb_form_q").clear()
    browser.find_element_by_id("sb_form_q").send_keys(key, Keys.ENTER)
    # implicitly_wait sets a global element-lookup timeout; it is not a sleep.
    browser.implicitly_wait(2)

    browser.find_element_by_class_name("fltIdtTit").click()  # open the "Filter" menu
    browser.implicitly_wait(1)

    # Absolute XPaths are brittle; they match Bing's filter layout at the
    # time of writing and will break if the page structure changes.
    browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/ul/li[1]/span/span').click()  # "Image size" dropdown
    browser.implicitly_wait(1)

    browser.find_element_by_xpath('/html/body/div[3]/div[2]/div/ul/li[1]/div/div/a[3]/span').click()  # pick "Medium"
    browser.implicitly_wait(1)

    # Keep scrolling until the page height stops growing (= no more results).
    while True:
        height_before = browser.execute_script("return document.body.scrollHeight;")
        for _ in range(20):  # scroll in bursts of 20
            browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(1)  # give lazy-loaded thumbnails time to arrive
        height_after = browser.execute_script("return document.body.scrollHeight;")
        print(height_before, height_after)
        if height_before == height_after:
            break

    allhtml = browser.page_source  # grab the fully-expanded page
    browser.quit()  # done with the browser once we have the HTML

    soup = BeautifulSoup(allhtml, 'lxml')
    anchors = soup.find_all('a', class_='iusc')  # every result tile is an <a class="iusc">
    # The full-resolution URL sits in the JSON-ish "murl" attribute.
    src_list = re.findall(r'\"murl\"\:\"(.*?)\"\,\"', str(anchors))
    print(len(src_list))
    for img_url in src_list:
        try:
            save_img(count, img_url, cur_picpath, name)
            count += 1
        except Exception:
            # fix: the bare `except:` also swallowed KeyboardInterrupt;
            # skip only ordinary download failures and move on.
            continue

该方法通过模拟我们手动打开浏览器,并输入关键字进行搜索,再将页面拉到最底下,获取当前页面的HTML源代码进行分析,获取图片链接并下载。在我们实际项目中有着广泛的应用。

文章来自个人专栏
文章 | 订阅
0条评论
0 / 1000
请输入你的评论
0
0