关于 Open-Sora-Plan 数据集——下载篇
下载渠道
- cc0 协议网站 pexels
- 开源数据集 panda70m
- youtube8m
- 待添加
下载准备工作
- 一台用于下载的远程主机
- 一台可通过 ssh 访问的本地存储机器
下载专用代码
-
所有代码只给出下载片段,需要海量下载请自行修改逻辑,或者联系本人。
-
pexels
import requests
import os
def save_video(slug, author_id, video_id, fps, width):
    """Download a single Pexels video and save it as ``<author_id>.mp4``.

    The download URL is assembled from the arguments, e.g.
    https://download.pexels.com/vimeo/364640745/pexels-magda-ehlers-3040808.mp4?fps=29.97&width=3840

    Args:
        slug: Text part of the file name in the URL (``pexels-magda-ehlers``
            in the example above).
        author_id: Numeric suffix of the file name in the URL; also used as
            the name of the saved file.
        video_id: Identifier placed after ``/vimeo/`` in the URL.
        fps: Value of the ``fps`` query parameter.
        width: Value of the ``width`` query parameter.

    Returns:
        The path of the saved video file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    save_folder = './'
    # Replace with your own request headers.
    headers = {}
    # Configure an IP proxy here to avoid getting your IP banned.
    proxies = {}
    video_url = f'https://download.pexels.com/vimeo/{video_id}/{slug}-{author_id}.mp4?fps={fps}&width={width}'
    file_name = os.path.join(save_folder, str(author_id)+'.mp4')
    # Write to a temporary name first so an interrupted transfer never leaves
    # a truncated file under the final name.
    part_name = file_name + '.part'
    # stream=True keeps large videos out of memory; the body is copied to
    # disk chunk by chunk instead of being materialized in resp.content.
    with requests.get(video_url, headers=headers, proxies=proxies,
                      timeout=10, stream=True) as resp:
        # Fail loudly on 403/404/5xx instead of saving an error page as .mp4.
        resp.raise_for_status()
        with open(part_name, 'wb') as w:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    w.write(chunk)
    os.replace(part_name, file_name)
    return file_name
- panda70m youtube8m 都是一样的下载方法
- 需提前安装 yt-dlp 库(pip install yt-dlp);代码中使用了 FFmpegVideoConvertor 将视频转为 mp4,因此还需要安装 FFmpeg
import os
import os.path as osp
import json
import yt_dlp
def ytb_download(uid, url, json_info, output_dir="ytb_videos/"):
    """Download one video with yt-dlp and store its metadata next to it.

    The video is saved as ``<output_dir>/<uid>.mp4`` and ``json_info`` is
    written to ``<output_dir>/<uid>.json``. If both files already exist the
    download is skipped.

    Args:
        uid: Identifier used as the base name of the output files.
        url: URL of the video to download.
        json_info: JSON-serializable metadata to store alongside the video.
        output_dir: Directory for the output files; created if missing.

    Returns:
        0 on success (or when the files already exist), -1 if the download
        or the metadata write failed.
    """
    os.makedirs(output_dir, exist_ok=True)

    video_path = osp.join(output_dir, f"{uid}.mp4")
    meta_path = osp.join(output_dir, f"{uid}.json")
    # Skip work that has already been done so the function can be re-run
    # safely over the same list of videos.
    if osp.exists(video_path) and osp.exists(meta_path):
        print(f"{uid} already labeled.")
        return 0

    yt_opts = {
        "format": "best",  # Download the best quality available
        "outtmpl": osp.join(output_dir, f"{uid}.%(ext)s"),
        "proxy": "",
        "postprocessors": [
            {
                "key": "FFmpegVideoConvertor",
                "preferedformat": "mp4",  # Convert video to mp4 format
            }
        ],
    }

    try:
        with yt_dlp.YoutubeDL(yt_opts) as ydl:
            ydl.download([url])
        # The metadata is written only after the download succeeded, so the
        # presence of the .json file marks a completed item.
        with open(meta_path, "w") as fp:
            json.dump(json_info, fp, indent=2)
    except Exception as err:
        # Catch Exception rather than everything, so Ctrl-C and SystemExit
        # still stop a long batch run; report the reason instead of hiding it.
        print(f"{uid} download failed: {err}")
        return -1
    return 0
- 以上代码可根据需求改为多线程或协程实现
评论0