import os
import time
import imageio
import whisper
import numpy as np
from PIL import Image
from skimage.metrics import structural_similarity as ssim
import tempfile

# Add FFmpeg to PATH; change this to match your actual installation path.
os.environ["PATH"] += os.pathsep + r"D:\ffmpeg\bin"  # e.g. D:\ffmpeg\bin

# ============================== Configuration ==============================
# Example: copy the video to D:\test\input.mp4
VIDEO_PATH = "D:/python项目文件/1/input2.mp4"  # input video path
MODEL_DIR = "D:/whisper_models"                # directory holding the manually downloaded model
SSIM_THRESHOLD = 0.85                          # keyframe de-duplication threshold
FRAME_INTERVAL = 2                             # frame sampling interval (seconds)
OUTPUT_DIR = "output2"                         # output directory
# ============================================================================
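
# How the two tuning knobs above interact: the extractor samples one frame
# every FRAME_INTERVAL seconds, and a sampled frame is promoted to a keyframe
# only when its SSIM against the previous keyframe drops below SSIM_THRESHOLD.
# Raising SSIM_THRESHOLD therefore keeps more (more similar) frames; lowering
# it keeps fewer, more distinct ones.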

def extract_keyframes_with_time(video_path: str) -> tuple:
    """Improved keyframe extraction (returns a list of keyframe images and a list of timestamps)."""
    try:
        # Initialize the video reader
        reader = imageio.get_reader(video_path, 'ffmpeg')
        fps = reader.get_meta_data().get('fps', 30)
        print(f"Video frame rate: {fps}fps, total duration: {reader.get_meta_data()['duration']:.1f}s")
        keyframes = []
        keyframe_times = []
        prev_frame = None
        frame_counter = 0
        for i, frame in enumerate(reader):
            # Sample at an interval: one frame every FRAME_INTERVAL seconds instead of every second
            if i % int(fps * FRAME_INTERVAL) != 0:
                continue
            current_time = i / fps
            # Downsample to 320x240 to speed up processing
            curr_frame = Image.fromarray(frame).resize((320, 240))
            if prev_frame is None:
                # Always keep the first sampled frame
                keyframes.append(curr_frame)
                keyframe_times.append(current_time)
                prev_frame = np.array(curr_frame.convert('L'))
            else:
                # Compute SSIM on the grayscale images
                curr_gray = np.array(curr_frame.convert('L'))
                score = ssim(prev_frame, curr_gray, data_range=255)
                if score < SSIM_THRESHOLD:
                    keyframes.append(curr_frame)
                    keyframe_times.append(current_time)
                    prev_frame = curr_gray
            frame_counter += 1
            if frame_counter % 10 == 0:
                print(f"Processed up to {current_time:.1f}s...")
        reader.close()
        print(f"Keyframe extraction finished: {len(keyframes)} keyframes in total")
        return keyframes, keyframe_times
    except Exception as e:
        print(f"Video processing failed: {str(e)}")
        return [], []
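
# Minimal illustration of the de-duplication rule above (an added sketch, not
# called anywhere in the pipeline): a sampled frame is kept only when its SSIM
# against the previous keyframe falls below SSIM_THRESHOLD, so two very
# different synthetic frames score near 0 and would be kept.
def _ssim_keep_demo() -> bool:
    demo_prev = np.zeros((240, 320), dtype=np.uint8)      # synthetic "previous keyframe"
    demo_curr = np.full((240, 320), 255, dtype=np.uint8)  # completely different frame
    score = ssim(demo_prev, demo_curr, data_range=255)
    return score < SSIM_THRESHOLD  # True: this frame would become a new keyframe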

def align_text_with_keyframes(video_path: str, keyframe_times: list) -> list:
    """Run Whisper speech recognition and align each transcript segment with a keyframe timestamp."""
    try:
        # 1. Dynamically add the FFmpeg path (Whisper calls ffmpeg to decode the audio)
        ffmpeg_bin = r"D:\ffmpeg\bin"
        os.environ["PATH"] = ffmpeg_bin + os.pathsep + os.environ["PATH"]
        # 2. Load the model, looking it up in the manually downloaded model directory
        model = whisper.load_model("tiny", device="cpu", download_root=MODEL_DIR)
        # 3. Run speech recognition (ffmpeg_path is no longer passed)
        result = model.transcribe(video_path, fp16=False)
        # 4. Align transcript segments with keyframe timestamps
        alignment = []
        kf_ptr = 0
        for seg in result["segments"]:
            seg_start = seg["start"]
            seg_end = seg["end"]
            matched_time = None
            # Consume every keyframe that occurs no later than this segment's end;
            # the last one consumed is the keyframe matched to the segment.
            while kf_ptr < len(keyframe_times):
                if keyframe_times[kf_ptr] <= seg_end:
                    matched_time = keyframe_times[kf_ptr]
                    kf_ptr += 1
                else:
                    break
            if matched_time is not None:
                alignment.append({
                    "text": seg["text"].strip(),
                    "start": seg_start,
                    "end": seg_end,
                    "keyframe_time": matched_time
                })
        return alignment
    except Exception as e:
        print(f"Speech processing failed: {str(e)}")
        return []
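
# Optional helper (an added sketch, not part of the original flow): print the
# first few alignment entries produced above, so the segment-to-keyframe
# mapping can be sanity-checked before anything is written to disk. It assumes
# the entry layout built in align_text_with_keyframes ('text', 'start', 'end',
# 'keyframe_time').
def preview_alignment(alignment: list, limit: int = 3) -> None:
    for item in alignment[:limit]:
        print(
            f"keyframe @ {item['keyframe_time']:.1f}s <- "
            f"[{item['start']:.1f}-{item['end']:.1f}s] {item['text']}"
        )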

def save_results(keyframes, alignment):
    """Save the keyframes and the text-alignment results."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Save keyframe images
    for i, img in enumerate(keyframes):
        img.save(os.path.join(OUTPUT_DIR, f"frame_{i:04d}.jpg"))
    # Save the aligned text
    with open(os.path.join(OUTPUT_DIR, "alignment.txt"), "w", encoding="utf-8") as f:
        for item in alignment:
            f.write(
                f"[{item['keyframe_time']:.1f}s] "
                f"({item['start']:.1f}-{item['end']:.1f}s): "
                f"{item['text']}\n"
            )
    print(f"Results saved to the {OUTPUT_DIR} directory")

# Print the temporary directory path and check write permission
temp_dir = tempfile.gettempdir()
print(f"Temporary directory: {temp_dir}")
if not os.access(temp_dir, os.W_OK):
    print("Error: the temporary directory is not writable!")
else:
    print("Temporary directory is writable")

if __name__ == "__main__":
    # Step 1: extract keyframes
    keyframes, keyframe_times = extract_keyframes_with_time(VIDEO_PATH)
    if not keyframes:
        exit()
    # Step 2: align speech with keyframes
    alignment = align_text_with_keyframes(VIDEO_PATH, keyframe_times)
    # Step 3: save the results
    save_results(keyframes, alignment)
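
# The optional helpers above could be wired in at the end of the __main__ block
# (an added usage note; left commented out so the original behaviour is unchanged):
#     preview_alignment(alignment)
#     save_alignment_json(alignment)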