在获取训练数据的过程中,我们有时需要从 B 站下载一些视频。通常的做法是使用关键字进行搜索,这时会得到若干包含该关键字的搜索结果页面。我写了一个小工具,可以爬取当前结果页面下所有视频的链接并保存,之后再根据这些链接下载视频,代码如下:
import functools
import os
import re
import subprocess
import time
import urllib
from threading import Thread

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # the Keys helper has to be imported explicitly
def timeout(timeout):
    """Build a decorator that limits how long a call may block the caller.

    The decorated function runs in a daemon thread and the caller waits for
    it for at most ``timeout`` seconds.

    Args:
        timeout: Maximum number of seconds to wait for the call to finish.

    Returns:
        A decorator. The wrapped function returns whatever the original
        function returns.

    Raises:
        TimeoutError: The call did not finish within ``timeout`` seconds.
            ``TimeoutError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
        Exception: Any exception raised by the wrapped function is re-raised
            in the calling thread.

    Note:
        The worker thread is not stopped when the time limit is hit; it is
        only abandoned. Being a daemon thread, it does not keep the
        interpreter alive.
    """
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Value and error live under separate keys so that a function
            # which legitimately *returns* an exception object is not
            # mistaken for one that raised it.
            outcome = {}

            def run_target():
                try:
                    outcome['value'] = func(*args, **kwargs)
                except Exception as exc:
                    outcome['error'] = exc

            worker = Thread(target=run_target)
            worker.daemon = True
            try:
                worker.start()
                worker.join(timeout)
            except Exception:
                print('error starting thread')
                raise

            if 'error' in outcome:
                raise outcome['error']
            if 'value' not in outcome:
                # Neither key was filled in: the worker is still running.
                raise TimeoutError(
                    'function [%s] timeout [%s seconds] exceeded!'
                    % (func.__name__, timeout)
                )
            return outcome['value']
        return wrapper
    return deco
def save_video(video_url):
    """Download one video with the ``you-get`` command-line tool.

    The URL is handed to ``you-get`` as a single argument without going
    through a shell, so characters such as ``&`` or ``;`` in a scraped link
    are treated as part of the URL instead of being interpreted as shell
    syntax.

    Args:
        video_url: Address of the video page to download.

    Returns:
        None. The exit status of ``you-get`` is not checked, so a failed
        download does not raise.
    """
    try:
        subprocess.run(['you-get', video_url], check=False)
    except FileNotFoundError:
        # Keep the best-effort behaviour: a missing tool is reported, not fatal.
        print('you-get executable not found; install it with pip or conda')
# Search-result page whose video links should be collected.
url = 'https://search.bilibili.com/all?keyword=%E7%AC%AC%E4%B8%89%E5%B1%8A%E6%98%9F%E7%81%AB%E6%9D%AF%E7%83%9B%E5%85%89%E8%B5%9B%E5%8C%BA&page=4'

# The value after the last "=" in the URL (the page number), plus one.
# It is the same for every list item, so it is parsed once, before the
# browser is started.
# NOTE(review): with the "+ 1" offset the `i == 0` branch is only taken for
# page=-1. It may have been meant to select a different layout for the first
# result page -- confirm before changing the offset.
i = int(url.split('=')[-1]) + 1
if i == 0:
    xpath_template = '/html/body/div[3]/div/div[2]/div/div[1]/div[2]/ul/li[{}]/a'
else:
    xpath_template = '/html/body/div[3]/div/div[2]/div/div[1]/ul/li[{}]/a'

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])

browser = webdriver.Chrome(options=options)
try:
    browser.get(url)
    browser.maximize_window()
    browser.refresh()
    time.sleep(4)
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')

    # The links are written to a file first and downloaded in a separate step
    # (by calling save_video() on each saved link), so that a failed download
    # cannot interrupt the crawl.
    with open('bzhan_url_1130.txt', 'a', encoding='utf-8') as code:
        for j in range(12):
            xpath = xpath_template.format(j + 1)
            try:
                src_list = browser.find_element(By.XPATH, xpath)
            except NoSuchElementException:
                print('此页面没有更多内容了...')
                break
            video_path = src_list.get_attribute('href')
            if not video_path:
                # An anchor without an href has nothing to record.
                continue
            code.write(video_path + '\n')
            # Flush per link so collected links survive a later crash.
            code.flush()
finally:
    # quit() closes the window and ends the driver session, even after an error.
    browser.quit()
需要注意的点如下:
1、需要下载you-get工具,直接使用pip或者conda安装即可
2、我采用的是先保存视频链接、再下载的方法,所以在爬取完视频链接之后,还需要读取已保存的链接,并对每个链接调用save_video()函数来下载视频