147 lines
5.1 KiB
Python
147 lines
5.1 KiB
Python
|
import os
|
|||
|
import time
|
|||
|
import imageio
|
|||
|
import whisper
|
|||
|
import numpy as np
|
|||
|
from PIL import Image
|
|||
|
from skimage.metrics import structural_similarity as ssim
|
|||
|
import tempfile
|
|||
|
|
|||
|
# Make the FFmpeg binaries discoverable (adjust to the local install path).
os.environ["PATH"] = os.environ["PATH"] + os.pathsep + r"D:\ffmpeg\bin"  # e.g. D:\ffmpeg\bin

# ============================== Configuration ==============================
# Example: copy the video to D:\test\input.mp4
VIDEO_PATH = "D:/python项目文件/1/input2.mp4"  # input video path
MODEL_DIR = "D:/whisper_models"  # directory holding manually downloaded models
SSIM_THRESHOLD = 0.85  # key-frame de-duplication threshold
FRAME_INTERVAL = 2  # frame sampling interval, in seconds
OUTPUT_DIR = "output2"  # output directory
# ===========================================================================
|
|||
|
|
|||
|
def extract_keyframes_with_time(video_path: str) -> tuple:
    """Sample frames at a fixed interval and keep the visually distinct ones.

    One frame is examined every ``FRAME_INTERVAL`` seconds. Each examined
    frame is resized to 320x240 and compared, in grayscale, against the most
    recently kept key frame using SSIM; it is kept when the score is below
    ``SSIM_THRESHOLD``. The first examined frame is always kept.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A ``(keyframes, keyframe_times)`` pair: the kept frames as 320x240 PIL
        images and their timestamps in seconds, in matching order. Both lists
        are empty when the video could not be processed (the error is printed,
        not raised).
    """
    try:
        reader = imageio.get_reader(video_path, 'ffmpeg')
        try:
            meta = reader.get_meta_data()
            # Fall back to 30 when fps is missing or zero so that the
            # timestamp division below can never divide by zero.
            fps = meta.get('fps') or 30
            duration = meta.get('duration')
            # A missing duration should not abort extraction; it is only
            # used for this informational banner.
            if isinstance(duration, (int, float)):
                duration_text = f"{duration:.1f}秒"
            else:
                duration_text = "未知"
            print(f"视频帧率: {fps}fps, 总时长: {duration_text}")

            # At least 1, otherwise very low fps * interval truncates to 0
            # and the modulo below raises ZeroDivisionError.
            step = max(1, int(fps * FRAME_INTERVAL))

            keyframes = []
            keyframe_times = []
            prev_frame = None  # grayscale array of the last kept key frame
            frame_counter = 0  # number of sampled (not decoded) frames

            for i, frame in enumerate(reader):
                # Only examine one frame per sampling interval.
                if i % step != 0:
                    continue

                current_time = i / fps
                # Downscale to 320x240 to speed up the comparison.
                curr_frame = Image.fromarray(frame).resize((320, 240))
                curr_gray = np.array(curr_frame.convert('L'))

                # The first sampled frame is always kept; later ones only
                # when they differ enough from the last kept key frame.
                if (prev_frame is None
                        or ssim(prev_frame, curr_gray, data_range=255) < SSIM_THRESHOLD):
                    keyframes.append(curr_frame)
                    keyframe_times.append(current_time)
                    prev_frame = curr_gray

                frame_counter += 1
                if frame_counter % 10 == 0:
                    print(f"已处理 {current_time:.1f}秒...")
        finally:
            # Release the reader even when decoding fails midway.
            reader.close()

        print(f"关键帧提取完成,共{len(keyframes)}帧")
        return keyframes, keyframe_times
    except Exception as e:
        # Best-effort contract: report the failure and return empty results.
        print(f"视频处理失败: {str(e)}")
        return [], []
|
|||
|
|
|||
|
|
|||
|
def _match_segments_to_keyframes(segments, keyframe_times: list) -> list:
    """Pair each transcript segment with the latest key frame at or before its end."""
    alignment = []
    kf_ptr = 0
    # Carried across segments: a segment with no new key frame inside it
    # still belongs to the key frame that was matched most recently.
    # NOTE(review): assumes keyframe_times is ascending and segments arrive
    # in chronological order - confirm for other callers.
    matched_time = None
    for seg in segments:
        seg_start = seg["start"]
        seg_end = seg["end"]
        while kf_ptr < len(keyframe_times) and keyframe_times[kf_ptr] <= seg_end:
            matched_time = keyframe_times[kf_ptr]
            kf_ptr += 1
        if matched_time is None:
            # The segment ends before the first key frame; nothing to attach.
            continue
        alignment.append({
            "text": seg["text"].strip(),
            "start": seg_start,
            "end": seg_end,
            "keyframe_time": matched_time
        })
    return alignment


def align_text_with_keyframes(video_path: str, keyframe_times: list) -> list:
    """Transcribe the video's speech and attach a key-frame time to each segment.

    The audio is transcribed with the Whisper ``tiny`` model on CPU. Every
    transcript segment is paired with the latest key frame whose timestamp is
    at or before the segment's end.

    Args:
        video_path: Path of the video file to transcribe.
        keyframe_times: Key-frame timestamps in seconds, in ascending order.

    Returns:
        A list of dicts with the keys ``text``, ``start``, ``end`` and
        ``keyframe_time``. An empty list is returned when transcription fails
        (the error is printed, not raised).
    """
    try:
        # 1. Put the FFmpeg directory at the front of PATH, but only once:
        #    prepending on every call would grow PATH without bound.
        ffmpeg_bin = r"D:\ffmpeg\bin"
        current_path = os.environ.get("PATH", "")
        if current_path.split(os.pathsep)[0] != ffmpeg_bin:
            os.environ["PATH"] = (
                (ffmpeg_bin + os.pathsep + current_path) if current_path else ffmpeg_bin
            )

        # 2. Load the model, preferring the manually downloaded copy in
        #    MODEL_DIR when that directory exists; otherwise Whisper falls
        #    back to its default download location.
        download_root = MODEL_DIR if os.path.isdir(MODEL_DIR) else None
        model = whisper.load_model("tiny", device="cpu", download_root=download_root)

        # 3. Run speech recognition (fp16 disabled for CPU execution).
        result = model.transcribe(video_path, fp16=False)

        # 4. Align the transcript segments with the key frames.
        return _match_segments_to_keyframes(result["segments"], keyframe_times)
    except Exception as e:
        # Best-effort contract: report the failure and return no alignment.
        print(f"语音处理失败: {str(e)}")
        return []
|
|||
|
|
|||
|
def save_results(keyframes, alignment):
    """Write the key frames and the text alignment into ``OUTPUT_DIR``.

    Args:
        keyframes: Images exposing a ``save(path)`` method; they are written
            as ``frame_0000.jpg``, ``frame_0001.jpg``, ... in list order.
        alignment: Dicts with the keys ``keyframe_time``, ``start``, ``end``
            and ``text``; one line per entry is written to ``alignment.txt``.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Key frames: one JPEG per frame, numbered by position.
    for index, image in enumerate(keyframes):
        target = os.path.join(OUTPUT_DIR, f"frame_{index:04d}.jpg")
        image.save(target)

    # Alignment: "[keyframe time] (start-end): text", one entry per line.
    report_path = os.path.join(OUTPUT_DIR, "alignment.txt")
    with open(report_path, "w", encoding="utf-8") as report:
        report.writelines(
            f"[{entry['keyframe_time']:.1f}s] "
            f"({entry['start']:.1f}-{entry['end']:.1f}s): "
            f"{entry['text']}\n"
            for entry in alignment
        )

    print(f"结果已保存至{OUTPUT_DIR}目录")
|
|||
|
|
|||
|
|
|||
|
|
|||
|
# Report the system temporary directory and whether this process may write to it.
temp_dir = tempfile.gettempdir()
print(f"临时目录: {temp_dir}")
print("临时目录可写" if os.access(temp_dir, os.W_OK) else "错误:临时目录不可写!")
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Step 1: extract the key frames and their timestamps.
    keyframes, keyframe_times = extract_keyframes_with_time(VIDEO_PATH)
    if not keyframes:
        # Nothing to align or save. Use SystemExit rather than the
        # site-provided exit() helper (not always installed), and signal the
        # failure to the caller with a non-zero status.
        raise SystemExit(1)

    # Step 2: transcribe the audio and align it with the key frames.
    alignment = align_text_with_keyframes(VIDEO_PATH, keyframe_times)

    # Step 3: persist the key frames and the alignment.
    save_results(keyframes, alignment)
|