0.前言
代码很简单。selenium模拟的部分没有实现自动登录,浏览器打开后需要手动登录。最终效果是获得指定收藏夹内所有视频的BV号——根据BV号下载的代码在第二部分。
为什么selenium模拟要用java写?这就是另一段故事了。总而言之这并没有什么难度。
1.selenium模拟
package org.example;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.google.gson.JsonArray;
import com.google.gson.annotations.JsonAdapter;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.edge.EdgeDriver;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
 * Collects the BV ids of every video in one Bilibili favourites folder.
 *
 * <p>The program opens bilibili.com in Chrome, pauses so the user can log in by hand,
 * opens the favourites page, reads the folder id and the page count from it, and then
 * walks the favourites list API page by page, writing one BV id per line to
 * {@code BV.txt} in the working directory.
 */
public class WebTest {
    /** Shared browser session; assigned by the constructor and used by the static helpers. */
    private static WebDriver browser;

    /** Favourites list API; {@code %s} is the folder id, {@code %d} the 1-based page number. */
    private static final String URL_PATTERN =
            "https://api.bilibili.com/x/v3/fav/resource/list?media_id=%s&pn=%d&ps=20&keyword=&order=mtime&type=0&tid=0&platform=web";

    /** Starts a new Chrome session and stores it in the shared {@link #browser} field. */
    public WebTest() {
        //System.setProperty("webdriver.edge.driver", "D:\\BrowserDriver\\msedgedriver.exe");
        browser = new ChromeDriver();
    }

    /**
     * Switches the driver to the window at the given position in the handle collection.
     *
     * @param num zero-based index into the handles returned by the driver
     *            (assumes the driver reports handles in opening order; confirm for your driver)
     * @throws IndexOutOfBoundsException if fewer than {@code num + 1} windows are open
     */
    public static void switchTo(int num) {
        // Typed list: with raw types the element would be an Object and could not be
        // passed to window(String).
        List<String> handles = new ArrayList<>(browser.getWindowHandles());
        if (num < 0 || num >= handles.size()) {
            throw new IndexOutOfBoundsException(
                    "window index " + num + " out of range, " + handles.size() + " window(s) open");
        }
        browser.switchTo().window(handles.get(num));
    }

    /**
     * Runs the whole scrape for the hard-coded favourites folder name.
     *
     * @param args unused
     * @throws InterruptedException if one of the fixed waits is interrupted
     * @throws IOException if {@code BV.txt} cannot be written or an API page holds no JSON
     */
    public static void main(String[] args) throws InterruptedException, IOException {
        String favorName = "冬雪莲";
        new WebTest();
        try {
            browser.manage().window().maximize();
            // Open the site, then wait: login is not automated and must be done by hand.
            browser.get("https://www.bilibili.com");
            Thread.sleep(15000);
            switchTo(0);
            browser.findElement(By.linkText("收藏")).click();
            // The favourites page is expected in a second window.
            switchTo(1);
            Thread.sleep(3000);
            //WebElement music = browser.findElement(By.xpath("//*[@id=\"fav-createdList-container\"]/ul/li[13]/a"));
            browser.findElement(By.linkText(favorName)).click();
            WebElement folderLink = browser.findElement(By.linkText(favorName));
            String fid = extractFid(folderLink.getAttribute("href"));
            System.out.println(fid);
            String pageNumString = browser.findElement(
                    By.xpath("//*[@id=\"page-fav\"]/div[1]/div[2]/div[3]/ul[2]/span[1]")).getText();
            System.out.println(pageNumString);
            int pageNum = parsePageCount(pageNumString);
            System.out.println(pageNum);

            // One writer for the whole run; try-with-resources closes it exactly once,
            // after the last page, and lets exceptions propagate instead of hiding them.
            try (BufferedWriter writer = new BufferedWriter(new FileWriter("BV.txt"))) {
                for (int page = 1; page <= pageNum; page++) {
                    browser.get(String.format(URL_PATTERN, fid, page));
                    writeBvids(browser.getPageSource(), writer);
                    // Short pause between API requests.
                    Thread.sleep(1000);
                }
            }
        } finally {
            // Always end the browser session, even when scraping fails half-way.
            browser.quit();
        }
    }

    /**
     * Extracts the value of the {@code fid} query parameter from a favourites-folder link.
     *
     * @param link the {@code href} of the folder link
     * @return the text between {@code fid=} and the next {@code &}, or the end of the link
     * @throws IllegalStateException if the link is missing or has no {@code fid=} parameter
     */
    private static String extractFid(String link) {
        if (link == null) {
            throw new IllegalStateException("favourites link has no href attribute");
        }
        int marker = link.indexOf("fid=");
        if (marker < 0) {
            throw new IllegalStateException("no fid parameter in favourites link: " + link);
        }
        int start = marker + 4;
        int end = link.indexOf('&', start);
        return end < 0 ? link.substring(start) : link.substring(start, end);
    }

    /**
     * Parses the page count out of pager text of the form {@code 共 N 页}.
     *
     * @param text the pager text; spaces around the number are optional
     * @return the page count
     * @throws IllegalStateException if the text does not contain {@code 共 ... 页}
     * @throws NumberFormatException if the text between the markers is not an integer
     */
    private static int parsePageCount(String text) {
        int start = text.indexOf("共");
        int end = text.indexOf("页", start + 1);
        if (start < 0 || end < 0) {
            throw new IllegalStateException("unexpected pager text: " + text);
        }
        return Integer.parseInt(text.substring(start + 1, end).trim());
    }

    /**
     * Parses one API response as rendered by the browser and writes its BV ids.
     *
     * @param pageSource browser page source; the JSON is taken from the first '{' to the last '}'
     * @param writer destination, one BV id per line
     * @throws IOException if no JSON object is found or writing fails
     */
    private static void writeBvids(String pageSource, BufferedWriter writer) throws IOException {
        int start = pageSource.indexOf("{");
        int end = pageSource.lastIndexOf("}");
        if (start < 0 || end < start) {
            throw new IOException("no JSON object found in API response");
        }
        JSONObject response = JSON.parseObject(pageSource.substring(start, end + 1));
        JSONObject data = response.getJSONObject("data");
        JSONArray medias = data == null ? null : data.getJSONArray("medias");
        if (medias == null) {
            // Nothing listed on this page (or an error payload): nothing to write.
            return;
        }
        for (int i = 0; i < medias.size(); i++) {
            String bvid = medias.getJSONObject(i).getString("bvid");
            if (bvid == null) {
                continue;
            }
            System.out.println(bvid);
            writer.write(bvid);
            writer.newLine();
        }
    }
}
需要的依赖(pom.xml):
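<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>WebTest</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.4.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
    </dependencies>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
</project>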
2.下载部分
import requests
import re
import json
from tqdm import tqdm
from lxml import etree
import os
import time
# HTTP headers sent with every request made by the downloader below.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36"
    ),
    "referer": "https://message.bilibili.com/",
}

# Module-level placeholder; the spider class keeps its own ``self.name``.
name = ""
class bilibiliSpider:
    """Download the audio track of Bilibili videos as ``<title>.mp3`` files.

    Files are written to the current working directory. Titles that already
    exist as ``<title>.mp3`` in ``<cwd>/音乐`` at construction time are skipped.
    """

    # Translation table that deletes the characters hundle_name strips from titles.
    _FORBIDDEN = str.maketrans("", "", ':?/\\|<>*"')
    # The video page embeds its stream description as JSON in this inline script.
    # NOTE(review): the pattern was missing from the published source (most likely
    # stripped as HTML); this is the commonly used form - confirm against a live page.
    _PLAYINFO_RE = re.compile(r'<script>window\.__playinfo__=(.*?)</script>', re.S)
    # Downloads whose content-length / 1024000 exceeds this value are skipped.
    _MAX_SIZE = 8

    def __init__(self, cwd):
        """Remember ``cwd`` and list the files already present in ``<cwd>/音乐``.

        :param cwd: directory that contains the ``音乐`` folder (must exist).
        """
        self.name = ""
        self.cwd = cwd
        self.exitMusic = os.listdir(os.path.join(self.cwd, "音乐"))

    def hundle_name(self, name):
        """Return ``name`` with the characters ``: ? / \\ | < > * "`` removed."""
        return name.translate(self._FORBIDDEN)

    def download(self, url):
        """Fetch a video page and download its audio unless it is already present.

        Returns silently when the page has no usable title element.

        :param url: full URL of the video page.
        """
        req = requests.get(url, headers=headers)
        htmltext = req.text
        selector = etree.HTML(htmltext)
        if selector is None:
            return
        links = selector.xpath('//*[@id="viewbox_report"]/h1')
        if len(links) == 0:
            return
        title = links[0].text
        if not title:
            # The <h1> exists but carries no direct text; there is no file name to use.
            return
        self.name = self.hundle_name(title)
        if self.name + ".mp3" in self.exitMusic:
            print("跳过", self.name)
            return
        self.get_json(htmltext)

    def get_json(self, htmltxt):
        """Extract the stream URLs from the page HTML and start the audio download.

        Prints a message and returns when the page carries no play info or the
        play info lacks the expected ``data.dash`` audio/video entries.

        :param htmltxt: HTML source of the video page.
        """
        time.sleep(0.5)
        match = self._PLAYINFO_RE.search(htmltxt)
        if match is None:
            print(self.name, "未找到播放信息,已经跳过")
            return
        js = json.loads(match.group(1))
        try:
            audiourl = js["data"]["dash"]["audio"][0]["base_url"]
            videourl = js["data"]["dash"]["video"][0]["base_url"]
        except (KeyError, IndexError, TypeError):
            print(self.name, "播放信息格式不符,已经跳过")
            return
        self.downloadAudio(audiourl, videourl)

    def downloadWithTqdm(self, url: str, fname: str):
        """Stream ``url`` into the file ``fname`` while showing a progress bar.

        Nothing is written when the server answers with an error status or when
        the reported size is over the limit.

        :param url: address of the resource to download.
        :param fname: path of the file to create.
        """
        # Stream the body; the context manager releases the connection on every path.
        with requests.get(url, stream=True, headers=headers) as resp:
            if not resp.ok:
                print(fname, "下载失败,状态码", resp.status_code)
                return
            # Size reported by the server; 0 when the header is absent.
            total = int(resp.headers.get('content-length', 0))
            if total / 1024000 > self._MAX_SIZE:
                print(fname, "过大,已经跳过")
                return
            with open(fname, 'wb') as file, tqdm(
                desc=fname,
                total=total,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in resp.iter_content(chunk_size=1024):
                    size = file.write(data)
                    bar.update(size)

    def downloadAudio(self, audiourl="", videourl=""):
        """Download the audio stream to ``<self.name>.mp3``.

        :param audiourl: URL of the audio stream.
        :param videourl: URL of the video stream; accepted but not downloaded.
        """
        self.downloadWithTqdm(audiourl, self.name + ".mp3")
if __name__ == "__main__":
    # Create the output folder if needed; unlike a bare ``except: pass`` this still
    # reports real failures such as missing permissions.
    os.makedirs("音乐", exist_ok=True)
    cwd = os.getcwd()
    # The spider lists <cwd>/音乐 on construction, so the folder must exist by now.
    a = bilibiliSpider(cwd)
    # One BV id per line; blank lines (e.g. a trailing newline) are ignored so that
    # no request is made for the bare "/video/" URL.
    with open(os.path.join(cwd, "BV.txt"), "r", encoding="utf-8") as f:
        bvids = [line.strip() for line in f if line.strip()]
    # Downloads are written to the current directory, so work inside the folder.
    os.chdir(os.path.join(cwd, "音乐"))
    for bvid in bvids:
        a.download("https://www.bilibili.com/video/" + bvid)
        # Short pause between videos.
        time.sleep(0.5)
3.后记
这学期积累不少小想法,我保证寒假会认真更新的。
大概。
喜闻乐见