当前位置:网站首页>哔哩哔哩视频爬取源码分享

哔哩哔哩视频爬取源码分享

2021-02-23 17:46:39 谜一样的青年

背景:

  无意间发现B站有位老师的课程特别好(Python教学视频),但单位的网络限制了视频网站的访问,所以尝试把视频下载保存起来。经过一段时间的研究,终于完成了代码的开发,大家有需要的可以拿去使用。后续我会继续做延伸优化,争取做到在前端页面输入视频名称作为爬取条件进行下载。

 

第一版:

#_author_='Lucky';
#date: 2021/2/18
import win32gui
import win32con
import win32api
import sys,os
import pynput,time
from time import sleep
from pywinauto import application
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

def man_dowload(url):
    """Submit one video address to the parser page and trigger the download.

    Uses the module-level ``bili_browser`` session and ``api_url``: opens the
    parser page, types ``url`` into the address box, presses the submit button,
    clicks the "MP4地址" link while holding the Alt key, and finally calls
    ``save_as_window()`` to deal with the native "Save As" dialog.

    :param url: address of the video page to hand to the parser.
    :return: None. Any exception is printed and swallowed, so the caller's
        loop over videos keeps going after a failed item.
    """
    address_box_xpath = "//*[@placeholder='输入地址']"
    try:
        bili_browser.get(api_url)
        sleep(25)  # fixed pause; presumably gives the parser page time to load
        bili_browser.find_element_by_xpath(address_box_xpath).clear()
        sleep(5)
        bili_browser.find_element_by_xpath(address_box_xpath).send_keys(url)
        sleep(3)
        bili_browser.find_element_by_id('button-1').click()
        element2 = bili_browser.find_element_by_xpath("//a[contains(text(),'MP4地址')]")
        sleep(2)
        # Press Alt, click the link and release Alt in ONE performed chain.
        # Previously the key_up action was built but never performed, so the
        # Alt key stayed logically pressed for the rest of the session.
        # Keys.ALT is the same code point the original spelled as u'\ue00a'.
        ActionChains(bili_browser) \
            .key_down(Keys.ALT) \
            .click(element2) \
            .key_up(Keys.ALT) \
            .perform()
        save_as_window()
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller continue.
        print(e)
#下面的chrome_options_setting直接引用就可以,不用动
def chrome_options_setting(web_driver):
    """
    Switch on the toggle found on Chrome's ``chrome://settings/downloads`` page.

    The page is built from nested web components, so the function descends one
    shadow root at a time until it reaches the ``cr-toggle`` element, reads its
    ``aria-pressed`` attribute and clicks it only when the value is "false".
    (The original note describes this toggle as "ask where to save each file
    before downloading".)

    :param web_driver: browser driver
    :return: None
    """
    shadow_script = "return arguments[0].shadowRoot"
    toggle_css = "#outerRow > cr-toggle[aria-describedby='sub-label-text']"

    def open_shadow(host):
        # Ask the browser for the shadow root attached to ``host``.
        return web_driver.execute_script(shadow_script, host)

    web_driver.get("chrome://settings/downloads")
    time.sleep(2)

    root = open_shadow(web_driver.find_element_by_xpath("//settings-ui"))
    # XPath selectors cannot be used below a shadow root, hence id/CSS lookups.
    main_host = root.find_element_by_id("container").find_element_by_id("main")
    root = open_shadow(main_host)
    for nested_css in (
        "settings-basic-page[role='main']",
        "settings-downloads-page",
        "settings-toggle-button",
    ):
        root = open_shadow(root.find_element_by_css_selector(nested_css))

    pressed = root.find_element_by_css_selector(toggle_css).get_attribute("aria-pressed")
    if pressed == "false":
        root.find_element_by_css_selector(toggle_css).click()
#下面方法是根据循环生成的xpath元素找到视频的名称,并返回文件名称给save_as_windows使用
def file_name_save():
    """Open the current video page and return the text of its title element.

    Reads the module-level ``bili_browser``, ``url`` and ``file_name_xpath``.
    The title is printed before it is returned.

    :return: the element text, or None when anything in the lookup raised
        (the exception is printed instead of propagated).
    """
    try:
        bili_browser.get(url)
        sleep(15)  # fixed pause before the lookup; presumably for page load
        title_node = bili_browser.find_element_by_xpath(file_name_xpath)
        video_title = title_node.text
        print(video_title)
    except Exception as err:
        print(err)
        return None
    return video_title
#下面是完成对话框文件名输入和保存的动作
def save_as_window():
    """Type the video title into the native "另存为" (Save As) dialog and save.

    The title comes from ``file_name_save()``. The dialog is located with
    pywinauto by its title and the ``#32770`` window class.

    Two guards are applied before the name is written into the edit box:

    * if no title could be read (``file_name_save()`` returned None or an
      empty string), the edit box is left untouched so the name already
      proposed by the dialog is used instead of passing None to ``set_text``;
    * characters that Windows does not allow in file names are replaced by
      ``_`` so a title containing e.g. ``/`` or ``:`` can still be saved.

    :return: None
    """
    end_file_name = file_name_save()
    app = application.Application().connect(title_re=u"另存为", class_name="#32770")
    save_as_spec = app.window(title=u"另存为", class_name="#32770")

    if end_file_name:
        forbidden = {ord(ch): u"_" for ch in u'\\/:*?"<>|'}
        safe_name = end_file_name.translate(forbidden)
        # Set the edit control's text directly; simulating keystrokes with
        # edit.type_keys(name, with_spaces=True) would be the alternative.
        edit = save_as_spec["Edit"]
        edit.set_text(safe_name)

    app['另存为']['保存(&S)'].click()


# Reference notes: excerpt that looks like print_control_identifiers() output
# for the dialog, showing the names under which the file-name edit box can be
# addressed.
#
# |
# | ComboBox - 'notes.txt' (L536, T675, R1188, B700)
# | ['ComboBox', '另存为ComboBox', 'ComboBox0', 'ComboBox1', '另存为ComboBox0', '另存为ComboBox1']
# | child_window(title="notes.txt", class_name="ComboBox")
# | |
# | | Edit - 'notes.txt' (L539, T678, R1168, B697)
# | | ['另存为Edit', 'Edit', '另存为Edit0', '另存为Edit1', 'Edit0', 'Edit1']
# | | child_window(title="notes.txt", class_name="Edit")
# |
# | Edit - 'notes.txt' (L539, T678, R1168, B697)
# | ['另存为Edit', 'Edit', '另存为Edit0', '另存为Edit1', 'Edit0', 'Edit1']
# | child_window(title="notes.txt", class_name="Edit")


if __name__ == "__main__":
    # NOTE: bili_browser, api_url, url and file_name_xpath must stay
    # module-level names: man_dowload() and file_name_save() read them as
    # globals, so this script body is intentionally not wrapped in a function.
    bili_browser = webdriver.Chrome()
    try:
        api_url = 'https://xbeibeix.com/api/bilibili'
        bili_browser.maximize_window()
        bili_browser.get(api_url)
        # Make Chrome ask for the save location of every download, so that the
        # "Save As" dialog appears and the file can be renamed.
        chrome_options_setting(web_driver=bili_browser)
        time.sleep(3)
        # Parts 442..633 of the playlist; the same index selects both the
        # ?p= page and the matching <li> entry holding the part title.
        for i in range(442, 634):
            url = 'https://www.bilibili.com/video/BV197411G75w?p=' + str(i)
            file_name_xpath = '//*[@id="multi_page"]/div[2]/ul/li[%d]/a/div/div[1]/span[2]' % i
            print(file_name_xpath)
            # Download the part; this also handles the Windows 10 "Save As"
            # window and changes the name of the saved file.
            man_dowload(url)
            time.sleep(5)
    finally:
        # Always close the browser, even when a step above raised; otherwise
        # the Chrome and chromedriver processes would be left running.
        bili_browser.quit()

版权声明
本文为[谜一样的青年]所创,转载请带上原文链接,感谢
https://www.cnblogs.com/pytest/p/14437319.html

随机推荐