使用Selenium爬取B站指定页面下的所有视频-天翼云开发者社区

在获取训练数据的过程中，我们有时需要从B站下载一些视频。通常的做法是使用关键字进行搜索，这时会得到若干包含该关键字的结果页面。为此我写了一个小工具，可以爬取当前结果页面下所有视频的链接并保存到文件中，代码如下：

from selenium import webdriver
from selenium.webdriver.common.keys import Keys #需要引入 keys 包
from selenium.webdriver.common.by import By
import time
import os
import requests
import urllib
from bs4 import BeautifulSoup
import re
import functools
from threading import Thread

def timeout(timeout):
    """Build a decorator that bounds how long a call is waited on.

    The decorated function runs in a daemon thread and the caller waits
    for at most ``timeout`` seconds. The worker thread is not stopped when
    the wait expires; the caller simply stops waiting for it.

    Args:
        timeout: Maximum number of seconds passed to ``Thread.join``.

    Returns:
        A decorator. The wrapped function returns whatever the original
        returned, provided it finished within the wait.

    Raises:
        Exception: With a "timeout ... exceeded" message when the worker
            has not stored a result by the time the wait ends.
        Exception: Any ``Exception`` raised inside the wrapped function is
            re-raised in the calling thread. A return value that is itself
            an exception instance is raised as well.
    """
    def decorate(func):
        @functools.wraps(func)
        def guarded(*args, **kwargs):
            # Pre-load the slot with the timeout error; the worker overwrites
            # it only if it finishes (or fails) before the wait is over.
            outcome = {
                'value': Exception(
                    'function [%s] timeout [%s seconds] exceeded!'
                    % (func.__name__, timeout)
                )
            }

            def run_target():
                try:
                    outcome['value'] = func(*args, **kwargs)
                except Exception as exc:
                    outcome['value'] = exc

            worker = Thread(target=run_target, daemon=True)
            try:
                worker.start()
                worker.join(timeout)
            except Exception as start_error:
                print('error starting thread')
                raise start_error

            result = outcome['value']
            if isinstance(result, BaseException):
                raise result
            return result
        return guarded
    return decorate


def save_video(video_url):
    """Download one video by running the ``you-get`` command-line tool.

    The URL is handed to ``you-get`` as a single argument instead of being
    pasted into a shell command string. Scraped links commonly carry query
    strings containing ``&`` or ``?``, which a shell would interpret
    (truncating the URL and running the remainder as a separate command).

    Args:
        video_url: URL of the video page to download.

    Returns:
        None. The exit status of ``you-get`` is not reported to the caller.
    """
    # Local import so this function does not depend on a file-level import.
    import subprocess

    try:
        # check=False: a failed download is not turned into an exception,
        # so a batch of downloads can keep going.
        subprocess.run(['you-get', video_url], check=False)
    except FileNotFoundError:
        print('you-get executable not found; install it with pip or conda')


# Collect the video links shown on one Bilibili search-results page and append
# them, one per line, to a text file. Downloading is a separate, later step:
# read the file back and pass each link to save_video().
from selenium.common.exceptions import NoSuchElementException

url = 'https://search.bilibili.com/all?keyword=%E7%AC%AC%E4%B8%89%E5%B1%8A%E6%98%9F%E7%81%AB%E6%9D%AF%E7%83%9B%E5%85%89%E8%B5%9B%E5%8C%BA&page=4'  # URL of the results page to scrape
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# The text after the last '=' in the URL is parsed as the page number; this
# does not depend on the loop variable, so it is computed once up front.
# NOTE(review): because of the "+ 1", i is 0 only for "page=-1", so the first
# XPath below is effectively never selected. Possibly "page - 1" was intended
# (a different layout on the first results page) -- confirm against the live
# DOM before changing; the original behaviour is kept here.
i = int(url.split('=')[-1]) + 1
if i == 0:
    xpath_template = '/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[{}]/a'
else:
    xpath_template = '/html/body/div[3]/div/div[2]/div/div[1]/ul/li[{}]/a'

# Links are written to a file first so that a failing download cannot
# interrupt the scraping. "with" closes the file even if scraping raises.
with open('bzhan_url_1130.txt', 'a', encoding='utf-8') as code:
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        browser.maximize_window()

        browser.refresh()
        time.sleep(4)  # fixed wait for the refreshed page to render

        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        # Probe list items 1..12 in order and stop at the first missing one.
        for j in range(12):
            xpath = xpath_template.format(j + 1)
            try:
                # find_element(By.XPATH, ...) replaces find_element_by_xpath,
                # which no longer exists in Selenium 4.3+.
                src_list = browser.find_element(By.XPATH, xpath)
            except NoSuchElementException:
                # Only a missing element means the list is exhausted; other
                # errors are allowed to surface instead of being mislabelled.
                print('此页面没有更多内容了...')
                break
            video_path = src_list.get_attribute('href')
            if not video_path:
                # An anchor without an href has nothing to record.
                continue
            code.write(video_path + '\n')
            code.flush()
    finally:
        # quit() closes every window and also ends the driver process;
        # close() would only close the current window.
        browser.quit()

需要注意的点如下：

1、需要下载you-get工具，直接使用pip或者conda安装即可

2、我采用的是先保存视频链接、再下载的方法（代码中对save_video()的调用已被注释掉），所以需要在爬取完视频链接之后，再读取保存的链接文件，逐条调用save_video()函数下载视频

from selenium import webdriver from selenium.webdriver.common.keys import Keys #需要引入 keys 包 from selenium.webdriver.common.by import By import time import os import requests import urllib from bs4 import BeautifulSoup import re import functools from threading import Thread def timeout(timeout): def deco(func): @functools.wraps(func) def wrapper(*args,**kwargs): res = [Exception('function [%s] timeout [%s seconds] exceeded!' % (func.__name__,timeout))] def newFunc(): try: res[0] = func(*args,**kwargs) except Exception as e: res[0] = e t = Thread(target=newFunc) t.daemon = True try: t.start() t.join(timeout) except Exception as je: print('error starting thread') raise je ret = res[0] if isinstance(ret,BaseException): raise ret return ret return wrapper return deco def save_video(video_url): command = 'you-get {}'.format(video_url) os.system(command) url = 'https://search.bilibili.com/all?keyword=%E7%AC%AC%E4%B8%89%E5%B1%8A%E6%98%9F%E7%81%AB%E6%9D%AF%E7%83%9B%E5%85%89%E8%B5%9B%E5%8C%BA&page=4' #要下载页面的网址 options = webdriver.ChromeOptions() options.add_experimental_option('excludeSwitches', ['enable-logging']) code = open('bzhan_url_1130.txt', 'a', encoding = 'utf-8') #首先保存视频链接，以免一边爬取一边下载出错 browser = webdriver.Chrome(options=options) browser.get(url) browser.maximize_window() browser.refresh() time.sleep(4) browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') for j in range(12): i = int(url.split('=')[-1]) + 1 if i==0: xpath = '/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[{}]/a'.format(j+1) else: xpath = '/html/body/div[3]/div/div[2]/div/div[1]/ul/li[{}]/a'.format(j+1) try: src_list = browser.find_element_by_xpath(xpath) video_path = src_list.get_attribute('href') # save_video(video_path) code.write(video_path+'\n') code.flush() except Exception as e: print('此页面没有更多内容了...') break code.close() browser.close()

息壤智算

应用商城

定价

合作伙伴

开发者

支持与服务

了解天翼云

使用Selenium爬取B站指定页面下的所有视频

使用Selenium爬取B站指定页面下的所有视频

活动

息壤智算

应用商城

定价

合作伙伴

开发者

支持与服务

了解天翼云

使用Selenium爬取B站指定页面下的所有视频

使用Selenium爬取B站指定页面下的所有视频