diff --git a/5.0/summary.pdf b/5.0/summary.pdf
new file mode 100644
index 0000000..b8c246d
Binary files /dev/null and b/5.0/summary.pdf differ
diff --git a/5.0/毕设.py b/5.0/毕设.py
new file mode 100644
index 0000000..55e3b2d
--- /dev/null
+++ b/5.0/毕设.py
@@ -0,0 +1,548 @@
+import os
+import re
+import base64
+import warnings
+import imageio
+import whisper
+import numpy as np
+import pdfkit
+from PIL import Image
+from skimage.metrics import structural_similarity as ssim
+from collections import defaultdict
+import subprocess
+from jinja2 import Environment
+import cv2
+from scipy.signal import find_peaks
+from skimage.feature import hog
+from skimage.color import rgb2gray
+
+# ======================== Global configuration ========================
+warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")
+VIDEO_PATH = "D:/python项目文件/1/input3.mp4"  # Input video path
+MODEL_DIR = "D:/whisper_models"  # Whisper model directory
+FFMPEG_BIN = r"D:\Program Files\ffmpeg\bin"  # FFmpeg installation path
+WKHTMLTOPDF_PATH = r"D:\wkhtmltopdf\bin\wkhtmltopdf.exe"  # Path to wkhtmltopdf
+SSIM_THRESHOLD = 0.85  # Keyframe deduplication threshold
+FRAME_INTERVAL = 2  # Frame sampling interval (seconds)
+OUTPUT_DIR = r"D:\桌面文件\python\output"  # Output directory
+TRANSITION_WORDS = ["接下来", "下一页", "如图"]  # Transition words to filter out
+HOG_THRESHOLD = 0.7  # HOG feature similarity threshold
+COLOR_THRESHOLD = 0.8  # Color histogram similarity threshold
+WHISPER_MODEL = "base"  # Whisper model size
+PROFESSIONAL_TERMS = {
+    "人工智能": "AI",
+    "机器学习": "ML",
+    "深度学习": "DL",
+    "神经网络": "NN",
+    "卷积神经网络": "CNN",
+    "循环神经网络": "RNN",
+    "自然语言处理": "NLP",
+    "计算机视觉": "CV",
+    "大数据": "Big Data",
+    "云计算": "Cloud Computing"
+}  # Dictionary of technical terms
+
+
+# ========================================================
+
+# ---------------------- Core processing module ----------------------
+class VideoProcessor:
+    def __init__(self):
+        os.environ["PATH"] = FFMPEG_BIN + os.pathsep + os.environ["PATH"]
+
+    @staticmethod
+    def check_ffmpeg():
+        """Verify that FFmpeg is available."""
+        try:
+            subprocess.run(["ffmpeg", "-version"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            print("[系统] FFmpeg验证成功")
+            return True
+        except Exception as e:
+            print(f"[错误] FFmpeg验证失败: {str(e)}")
+            return False
+
+    @staticmethod
+    def calculate_color_histogram(frame):
+        """Compute a color histogram feature vector."""
+        hist = cv2.calcHist([frame], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
+        cv2.normalize(hist, hist)
+        return hist.flatten()
+
+    @staticmethod
+    def calculate_hog_features(frame):
+        """Compute HOG features."""
+        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        features = hog(gray, orientations=8, pixels_per_cell=(16, 16),
+                       cells_per_block=(1, 1), visualize=False)
+        return features
+
+    @staticmethod
+    def is_ppt_transition(frame1, frame2):
+        """Detect a slide (PPT page) transition."""
+        # Convert to grayscale
+        gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
+        gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
+
+        # Detect edges
+        edges1 = cv2.Canny(gray1, 100, 200)
+        edges2 = cv2.Canny(gray2, 100, 200)
+
+        # Measure the edge difference
+        diff = cv2.absdiff(edges1, edges2)
+        return np.mean(diff) > 50  # threshold can be tuned
+
+    @staticmethod
+    def extract_keyframes(video_path: str) -> tuple:
+        """Extract deduplicated keyframes and their timestamps (multi-feature fusion)."""
+        try:
+            reader = imageio.get_reader(video_path)
+            fps = reader.get_meta_data()["fps"]
+            total_frames = reader.count_frames()
+            print(f"[信息] 视频总帧数: {total_frames}")
+
+            keyframes = []
+            timestamps = []
+            prev_frame = None
+            frame_count = 0
+            last_progress = 0
+
+            for idx, frame in enumerate(reader):
+                # Report progress
+                progress = int((idx / total_frames) * 100)
+                if progress != last_progress and progress % 5 == 0:  # report once every 5%
+                    print(f"[进度] 处理中: {progress}% ({idx}/{total_frames}帧)")
+                    last_progress = progress
+
+                curr_time = idx / fps
+                if curr_time - (timestamps[-1] if timestamps else 0) < FRAME_INTERVAL:
+                    continue
+
+                # Multi-feature similarity computation
+                if prev_frame is not None:
+                    try:
+                        # 1. SSIM similarity (simplified version)
+                        gray_prev = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
+                        gray_curr = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+                        ssim_score = ssim(gray_prev, gray_curr, win_size=3)
+
+                        # 2. Color histogram similarity
+                        hist_prev = VideoProcessor.calculate_color_histogram(prev_frame)
+                        hist_curr = VideoProcessor.calculate_color_histogram(frame)
+                        color_sim = cv2.compareHist(hist_prev, hist_curr, cv2.HISTCMP_CORREL)
+
+                        # 3. HOG similarity (only computed when SSIM and color similarity are high)
+                        if ssim_score > 0.8 and color_sim > 0.8:
+                            hog_prev = VideoProcessor.calculate_hog_features(prev_frame)
+                            hog_curr = VideoProcessor.calculate_hog_features(frame)
+                            hog_sim = np.dot(hog_prev, hog_curr) / (np.linalg.norm(hog_prev) * np.linalg.norm(hog_curr))
+                        else:
+                            hog_sim = 0  # skip the HOG computation when SSIM and color similarity are low
+
+                        # 4. Slide-transition detection
+                        is_transition = VideoProcessor.is_ppt_transition(prev_frame, frame)
+
+                        # Combined decision
+                        if (ssim_score > SSIM_THRESHOLD and
+                                color_sim > COLOR_THRESHOLD and
+                                hog_sim > HOG_THRESHOLD and
+                                not is_transition):
+                            continue
+                    except Exception as e:
+                        print(f"[警告] 特征计算失败: {str(e)}")
+                        continue
+
+                keyframes.append(Image.fromarray(frame))
+                timestamps.append(curr_time)
+                prev_frame = frame
+                frame_count += 1
+
+                # Force garbage collection every 100 frames
+                if frame_count % 100 == 0:
+                    import gc
+                    gc.collect()
+
+            reader.close()
+            print(f"[图像] 关键帧提取完成,共{len(keyframes)}帧")
+            return keyframes, timestamps
+        except Exception as e:
+            print(f"[错误] 关键帧提取失败: {str(e)}")
+            return [], []
+
+    @staticmethod
+    def transcribe_audio(video_path: str, model_name: str = WHISPER_MODEL) -> list:
+        """Speech recognition with timestamps (supports mixed Chinese and English)."""
+        try:
+            # Use a larger model to improve accuracy
+            model = whisper.load_model(model_name, device="cpu", download_root=MODEL_DIR)
+
+            # Configure transcription parameters
+            result = model.transcribe(
+                video_path,
+                fp16=False,
+                language="zh",
+                task="transcribe",
+                verbose=True,
+                initial_prompt="这是一段包含中英文的PPT讲解视频,可能包含专业术语。"
+            )
+
+            segments = result.get("segments", [])
+
+            # Post-processing: annotate technical terms
+            for seg in segments:
+                text = seg["text"]
+                for cn, en in PROFESSIONAL_TERMS.items():
+                    text = text.replace(cn, f"{cn}({en})")
+                seg["text"] = text
+
+            return segments
+        except Exception as e:
+            print(f"[错误] 语音识别失败: {str(e)}")
+            return []
+
+
+# ---------------------- Business logic module ----------------------
+class ContentAligner:
+    @staticmethod
+    def generate_page_intervals(timestamps: list, duration: float) -> list:
+        """Generate the time interval covered by each page."""
+        intervals = []
+        for i in range(len(timestamps)):
+            start = timestamps[i]
+            end = timestamps[i + 1] if i < len(timestamps) - 1 else duration
+            intervals.append((start, end))
+        return intervals
+
+    @staticmethod
+    def calculate_text_similarity(text1: str, text2: str) -> float:
+        """Compute text similarity."""
+        # Simple word-overlap (Jaccard) similarity
+        words1 = set(re.findall(r'\w+', text1.lower()))
+        words2 = set(re.findall(r'\w+', text2.lower()))
+        if not words1 or not words2:
+            return 0.0
+        intersection = words1.intersection(words2)
+        union = words1.union(words2)
+        return len(intersection) / len(union)
+
+    @staticmethod
+    def find_best_match(segments: list, intervals: list) -> dict:
+        """Find the best-matching page for each speech segment."""
+        page_texts = defaultdict(list)
+        unmatched_segments = []
+
+        for seg in segments:
+            seg_start = seg["start"]
+            best_match = None
+            best_score = 0.0
+
+            # 1. Try timestamp matching first
+            for page_idx, (start, end) in enumerate(intervals):
+                if start <= seg_start < end:
+                    best_match = page_idx
+                    break
+
+            # 2. If timestamp matching fails, fall back to text-similarity matching
+            if best_match is None:
+                for page_idx, (start, end) in enumerate(intervals):
+                    # Collect all text that falls inside this page's interval
+                    page_text = " ".join([s["text"] for s in segments if start <= s["start"] < end])
+                    similarity = ContentAligner.calculate_text_similarity(seg["text"], page_text)
+                    if similarity > best_score:
+                        best_score = similarity
+                        best_match = page_idx
+
+            # 3. If a match was found, attach the segment to that page
+            if best_match is not None:
+                page_texts[best_match].append(seg)
+            else:
+                unmatched_segments.append(seg)
+
+        # 4. Handle unmatched segments
+        if unmatched_segments:
+            print(f"[警告] 发现{len(unmatched_segments)}个未匹配的语音片段")
+            # Attach each unmatched segment to the page whose interval midpoint is closest
+            for seg in unmatched_segments:
+                closest_page = min(range(len(intervals)),
+                                   key=lambda i: abs(seg["start"] - (intervals[i][0] + intervals[i][1]) / 2))
+                page_texts[closest_page].append(seg)
+
+        return page_texts
+
+    @staticmethod
+    def align_content(video_path: str, timestamps: list) -> list:
+        """Main speech-to-frame alignment logic (improved version)."""
+        try:
+            reader = imageio.get_reader(video_path)
+            duration = reader.get_meta_data()["duration"]
+            reader.close()
+        except Exception:
+            duration = timestamps[-1] + FRAME_INTERVAL
+
+        segments = VideoProcessor.transcribe_audio(video_path)
+        intervals = ContentAligner.generate_page_intervals(timestamps, duration)
+
+        # Use the improved matching algorithm
+        page_texts = ContentAligner.find_best_match(segments, intervals)
+
+        # Build the final aligned data
+        aligned_data = []
+        for idx in range(len(intervals)):
+            text = " ".join([seg["text"] for seg in page_texts.get(idx, [])])
+            aligned_data.append({
+                "page": idx,
+                "start_time": intervals[idx][0],
+                "end_time": intervals[idx][1],
+                "text": text
+            })
+
+        return aligned_data
+
+
+# ---------------------- Summary generation module ----------------------
+class SummaryGenerator:
+    @staticmethod
+    def optimize_text(text: str) -> str:
+        """Condense the text by dropping short, transitional, or duplicate sentences."""
+        sentences = re.split(r'[。!?]', text)
+        filtered = []
+        seen = set()
+        for sent in sentences:
+            sent = sent.strip()
+            if (len(sent) >= 10
+                    and not any(word in sent for word in TRANSITION_WORDS)
+                    and sent not in seen):
+                filtered.append(sent)
+                seen.add(sent)
+        return '。'.join(filtered) + '。' if filtered else ""
+
+    @staticmethod
+    def generate_html(aligned_data: list, keyframes: list, output_dir: str):
+        """Generate the HTML report."""
+        pages_data = []
+        temp_img_dir = os.path.join(output_dir, "_temp_images")
+        os.makedirs(temp_img_dir, exist_ok=True)
+
+        try:
+            for idx, frame in enumerate(keyframes):
+                img_path = os.path.join(temp_img_dir, f"page_{idx}.jpg")
+                frame.save(img_path)
+                with open(img_path, "rb") as f:
+                    img_data = base64.b64encode(f.read()).decode("utf-8")
+
+                pages_data.append({
+                    "num": idx + 1,
+                    "time": f"{aligned_data[idx]['start_time']:.1f}s - {aligned_data[idx]['end_time']:.1f}s",
+                    "image": f"data:image/jpeg;base64,{img_data}",
+                    "text": SummaryGenerator.optimize_text(aligned_data[idx]["text"])
+                })
+
+            env = Environment()
+            template = env.from_string("""
+
+
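The script imports pdfkit and defines WKHTMLTOPDF_PATH, but the PDF-export step falls later in the 548-line file, outside this excerpt. Below is a minimal sketch of how such an HTML report is typically rendered to PDF with pdfkit; the helper name and file paths are illustrative assumptions, not code from this diff.

    import pdfkit

    def export_pdf(html_path: str, pdf_path: str, wkhtmltopdf_path: str) -> None:
        # Hypothetical helper: point pdfkit at the local wkhtmltopdf binary,
        # then render the saved HTML report to a PDF file.
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
        options = {"enable-local-file-access": None, "encoding": "UTF-8"}
        pdfkit.from_file(html_path, pdf_path, configuration=config, options=options)

    # Example usage (paths are assumptions):
    # export_pdf("summary.html", "summary.pdf", WKHTMLTOPDF_PATH)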