commit 6f3b862bdbc02c8439bd65651ff25f6e2ad7a420
Author: b3ni15
Date:   Sun Nov 30 21:28:22 2025 +0100

    first commit

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..8fdd834
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,14 @@
+__pycache__/
+*.pyc
+.DS_Store
+.venv/
+env/
+build/
+dist/
+tts_segments/
+audio.wav
+dub_mix.wav
+dubbed_output.mp4
+*.json
+*.mp4
+*.wav
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..845ec65
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+ARG TARGETPLATFORM
+FROM --platform=${TARGETPLATFORM:-linux/amd64} python:3.11-slim-bookworm
+
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ffmpeg libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --upgrade pip \
+    && pip install -r requirements.txt
+
+COPY . .
+
+ENTRYPOINT ["python", "main.py"]
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..a20fec5
--- /dev/null
+++ b/main.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""Dub a video into Hungarian.
+
+Pipeline: extract audio (FFmpeg) -> transcribe (Whisper) -> translate EN->HU
+(GoogleTranslator) -> diarize speakers (pyannote) -> synthesize speech
+(edge-tts) -> mix the dub track and mux it onto the video (FFmpeg).
+
+Usage: python3 main.py <video>   (the HF_TOKEN environment variable is required)
+"""
+import os
+import sys
+import json
+import subprocess
+import numpy as np
+import soundfile as sf
+import torchaudio
+import torch
+import whisper
+from deep_translator import GoogleTranslator
+from pyannote.audio import Pipeline
+from rich.console import Console
+from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, TimeElapsedColumn
+
+console = Console()
+
+# --- Register TorchVersion as a safe global for torch checkpoint loading (checkpoint hack) ---
+from torch.serialization import add_safe_globals
+from torch.torch_version import TorchVersion
+add_safe_globals([TorchVersion])
+
+
+def run_tool(cmd):
+    """Run an external command with stdout discarded.
+
+    Returns (ok, stderr_text): ok is True only when the command started and
+    exited with status 0; stderr_text is kept so failures can be reported.
+    """
+    try:
+        proc = subprocess.run(
+            cmd,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+            text=True,
+            errors="replace",
+        )
+    except OSError as exc:
+        # Executable missing or not runnable.
+        return False, str(exc)
+    return proc.returncode == 0, proc.stderr or ""
+
+
+def fail(message, detail=""):
+    """Print message (plus the tail of any tool output) and exit with status 1."""
+    console.print(message)
+    detail = detail.strip()
+    if detail:
+        # markup=False: tool output may contain "[...]" that rich would parse as markup.
+        console.print(detail[-2000:], markup=False)
+    sys.exit(1)
+
+
+# --- Command-line argument and environment checks (the HF token is mandatory) ---
+if len(sys.argv) < 2:
+    console.print("❌ Használat: python3 main.py <video>")
+    sys.exit(1)
+
+video_path = sys.argv[1]
+if not os.path.exists(video_path):
+    console.print("❌ A videó nem található!")
+    sys.exit(1)
+
+token = os.environ.get("HF_TOKEN")
+if not token:
+    console.print("❌ HF_TOKEN NINCS beállítva!")
+    sys.exit(1)
+
+AUDIO_WAV = "audio.wav"
+
+# --- 1. Extract the audio with FFmpeg into a mono 16 kHz WAV ---
+console.print("\n🎬 [1/6] Hang kinyerés videóból (mono, 16kHz WAV)…")
+
+# Remove any stale file so a leftover from an earlier run cannot mask a failure.
+if os.path.exists(AUDIO_WAV):
+    os.remove(AUDIO_WAV)
+
+with Progress(
+    TextColumn("{task.description}"),
+    BarColumn(),
+    TimeElapsedColumn(),
+    TimeRemainingColumn(),
+) as progress:
+    progress.add_task("FFmpeg extract fut…", total=None)
+    ok, err = run_tool([
+        "ffmpeg","-y","-i",video_path,
+        "-map","0:a:0","-ac","1","-ar","16000","-c:a","pcm_s16le",AUDIO_WAV
+    ])
+
+if not ok or not os.path.exists(AUDIO_WAV):
+    fail("❌ Hang extract sikertelen!", err)
+
+console.print("✔ Hang kinyerve: audio.wav\n")
+
+
+# --- 2. Whisper STT (English speech recognition) ---
+console.print("🎤 [2/6] Speech-to-Text (Whisper, EN felismerés)…")
+
+model = whisper.load_model("large")  # stable, multilingual
+with Progress(TextColumn("Whisper STT…"), BarColumn(), TimeElapsedColumn(), TimeRemainingColumn()) as p:
+    p.add_task("STT fut…", total=None)
+    result = model.transcribe(AUDIO_WAV, language="en")
+
+segments = result.get("segments", [])
+console.print(f"✔ STT kész, szegmensek száma: {len(segments)}")
+
+# --- 2.5 Translate English -> Hungarian ---
+console.print("\n🌍 Angol → Magyar fordítás…")
+translator = GoogleTranslator(source="en", target="hu")
+
+translated = []
+for seg in segments:
+    text_en = seg["text"].strip()
+    # "or ''" keeps "hu" a string even if the translator returns None,
+    # so the later .strip() on it cannot fail.
+    text_hu = (translator.translate(text_en) or "") if text_en else ""
+    translated.append({
+        "speaker": "UNKNOWN",  # filled in later from the diarization
+        "start": seg["start"],
+        "end": seg["end"],
+        "en": text_en,
+        "hu": text_hu
+    })
+
+with open("segments_translated.json","w",encoding="utf-8") as f:
+    json.dump(translated,f,ensure_ascii=False,indent=2)
+
+console.print("✔ Fordított szegmensek mentve: segments_translated.json\n")
+
+
+# --- 3. Speaker recognition (pyannote diarization) ---
+console.print("👥 [3/6] Beszélők diarization (pyannote/speaker-diarization-3.1)…")
+
+waveform, sample_rate = torchaudio.load(AUDIO_WAV)
+audio_dict = {"waveform": waveform, "sample_rate": sample_rate}
+
+# NOTE(review): the keyword is "token" here; older pyannote.audio releases
+# are believed to expect "use_auth_token" - confirm against the installed version.
+diar_pipeline = Pipeline.from_pretrained(
+    "pyannote/speaker-diarization-3.1",
+    token=token
+)
+
+with Progress(TextColumn("Diarization fut…"), BarColumn(), TimeElapsedColumn(), TimeRemainingColumn()) as p:
+    p.add_task("Beszélők szétválasztása…", total=None)
+    diarization = diar_pipeline(audio_dict)
+
+# Walk the diarized speaker turns
+speaker_segments = []
+speakers = set()
+
+for segment, _, speaker in diarization.itertracks(yield_label=True):
+    speakers.add(speaker)
+    speaker_segments.append({
+        "speaker": speaker,
+        "start": float(segment.start),
+        "end": float(segment.end)
+    })
+
+with open("diar.json","w",encoding="utf-8") as f:
+    json.dump(speaker_segments,f,ensure_ascii=False,indent=2)
+
+console.print(f"✔ Diarization kész, beszélők: {sorted(speakers)}\n")
+
+# --- 4. Fill in the speaker of each translated segment from the diarization timing ---
+console.print("🧠 [4/6] Speaker ID STT → speaker idő alapján…")
+
+def find_speaker_for_time(t, end=None):
+    """Return the diarized speaker at time t, or for the span [t, end].
+
+    With only t, the first diarization turn containing t wins. With end, the
+    speaker whose turns overlap [t, end] the longest wins, so a segment that
+    starts just outside a turn is still attributed; if nothing overlaps, the
+    point lookup at t is used. Returns "UNKNOWN" when nothing matches.
+    """
+    if end is not None:
+        overlap_by_speaker = {}
+        for s in speaker_segments:
+            overlap = min(end, s["end"]) - max(t, s["start"])
+            if overlap > 0:
+                overlap_by_speaker[s["speaker"]] = overlap_by_speaker.get(s["speaker"], 0.0) + overlap
+        if overlap_by_speaker:
+            return max(overlap_by_speaker, key=overlap_by_speaker.get)
+
+    for s in speaker_segments:
+        if s["start"] <= t <= s["end"]:
+            return s["speaker"]
+    return "UNKNOWN"
+
+aligned = []
+for item in translated:
+    spk = find_speaker_for_time(item["start"], item["end"])
+    aligned.append({
+        "speaker": spk,
+        "start": item["start"],
+        "end": item["end"],
+        "en": item["en"],
+        "hu": item["hu"]
+    })
+
+with open("aligned_segments.json","w",encoding="utf-8") as f:
+    json.dump(aligned,f,ensure_ascii=False,indent=2)
+
+console.print("✔ Speakerhez rendelt szegmensek mentve: aligned_segments.json\n")
+
+
+# --- 5. TTS: a separate Hungarian voice per speaker (edge-tts) ---
+console.print("🎙 [5/6] Magyar hang generálás speakerenként külön voice-al…")
+
+available_voices = [
+    "hu-HU-NoemiNeural",
+    "hu-HU-TamasNeural",
+    "hu-HU-LillaNeural",
+    "hu-HU-ImreNeural",
+]
+
+spk_list = sorted({s["speaker"] for s in aligned})
+# Voices are assigned round-robin, so they repeat beyond len(available_voices) speakers.
+voice_map = {spk: available_voices[i % len(available_voices)] for i, spk in enumerate(spk_list)}
+
+console.print("🗣 Speaker → Voice kiosztás:")
+for k,v in voice_map.items():
+    console.print(f" {k} → {v}")
+
+os.makedirs("tts_segments", exist_ok=True)
+tts_meta = []
+
+for i, seg in enumerate(aligned):
+    spk = seg["speaker"]
+    text = seg["hu"].strip()
+    if not text:
+        continue
+
+    voice = voice_map.get(spk, available_voices[0])
+    # NOTE(review): edge-tts is believed to emit MP3 data regardless of the
+    # ".wav" name; sf.read() below must be able to decode it - confirm.
+    out_path = os.path.join("tts_segments", f"tts_{i:04d}.wav")
+
+    # "--text=<value>" form: a translation starting with "-" must not be
+    # mistaken for a command-line option.
+    ok, _ = run_tool([
+        "edge-tts", "--voice", voice,
+        f"--text={text}",
+        "--write-media", out_path
+    ])
+
+    # Require a clean exit and a non-empty file, so a failed call cannot
+    # pick up a stale or truncated file left by an earlier run.
+    if ok and os.path.exists(out_path) and os.path.getsize(out_path) > 0:
+        tts_meta.append({"path": out_path, "start": seg["start"]})
+
+console.print(f"\n✔ {len(tts_meta)} db TTS hang-szegmens elkészült\n")
+
+
+# --- 6. Build the Hungarian TTS track and mux it into the video ---
+console.print("🎚 [6/6] Magyar dub hangsáv létrehozása és visszaillesztése videóra…")
+
+if not tts_meta:
+    console.print("❌ Nincs TTS, kilépek.")
+    sys.exit(1)
+
+# The dub track must span the whole source audio: the mux below uses
+# "-shortest", so a track that ended at the last spoken segment would cut
+# off the rest of the video.
+audio_duration = waveform.shape[-1] / sample_rate
+video_duration = max(aligned[-1]["end"], audio_duration)
+target_sr = 48000
+samples = int((video_duration+1)*target_sr)
+mix = np.zeros(samples, dtype=np.float32)
+
+resamplers = {}  # one Resample transform per source rate, built on first use
+for tts in tts_meta:
+    data_tts, sr_tts = sf.read(tts["path"])
+    if data_tts.ndim > 1:
+        # Down-mix multi-channel clips; mix is a 1-D (mono) buffer.
+        data_tts = data_tts.mean(axis=1)
+    if sr_tts != target_sr:
+        if sr_tts not in resamplers:
+            resamplers[sr_tts] = torchaudio.transforms.Resample(sr_tts, target_sr)
+        res = resamplers[sr_tts]
+        data_tts = res(torch.from_numpy(data_tts).float().unsqueeze(0)).squeeze(0).numpy()
+
+    start_i = int(tts["start"]*target_sr)
+    if start_i >= len(mix):
+        continue
+    # Clip the clip so it never runs past the end of the mix buffer.
+    end_i = min(start_i + len(data_tts), len(mix))
+    mix[start_i:end_i] += data_tts[:end_i - start_i]
+
+# Peak-normalize to 0.9; the epsilon avoids division by zero on a silent mix.
+mix = mix / (np.max(np.abs(mix))+1e-9) * 0.9
+sf.write("dub_mix.wav", mix, target_sr)
+console.print("✔ Dub sáv mentve: dub_mix.wav")
+
+ok, err = run_tool([
+    "ffmpeg","-y","-i",video_path,
+    "-i","dub_mix.wav",
+    "-map","0:v:0","-map","1:a:0",
+    "-c:v","copy","-c:a","aac","-shortest",
+    "dubbed_output.mp4"
+])
+
+# Only report success when FFmpeg actually succeeded.
+if not ok:
+    fail("❌ Videó muxolás sikertelen!", err)
+
+console.print("\n✅ Magyar AI dub kész: dubbed_output.mp4")
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5c6976a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+openai-whisper
+pyannote.audio
+torch
+torchaudio
+numpy
+soundfile
+rich
+deep-translator
+edge-tts
\ No newline at end of file