first commit

This commit is contained in:
2025-11-30 21:28:22 +01:00
commit 6f3b862bdb
4 changed files with 281 additions and 0 deletions

14
.dockerignore Normal file
View File

@@ -0,0 +1,14 @@
# Python bytecode caches
__pycache__/
*.pyc
# macOS Finder metadata
.DS_Store
# Local virtual environments and packaging output
.venv/
env/
build/
dist/
# Artifacts written by main.py (TTS clips, extracted audio, mixed dub, final video)
tts_segments/
audio.wav
dub_mix.wav
dubbed_output.mp4
# Catch-all patterns for JSON dumps and media files.
# NOTE(review): *.mp4 also keeps input videos out of the build context, so the
# video presumably has to be provided to the container at run time — confirm.
*.json
*.mp4
*.wav

20
Dockerfile Normal file
View File

@@ -0,0 +1,20 @@
# Container image for the dubbing pipeline in main.py.
# TARGETPLATFORM is declared before FROM so it can be used in the FROM line;
# when it is unset or empty the image falls back to linux/amd64.
ARG TARGETPLATFORM
FROM --platform=${TARGETPLATFORM:-linux/amd64} python:3.11-slim-bookworm
# Unbuffered Python output, no pip download cache, no pip self-update check.
ENV PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# System packages: ffmpeg is invoked by main.py via subprocess; libsndfile1 is
# presumably the native library behind the `soundfile` Python package.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update \
&& apt-get install -y --no-install-recommends ffmpeg libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy only the dependency list first so the pip layer can be reused from the
# build cache when just the application code changes.
COPY requirements.txt .
RUN pip install --upgrade pip \
&& pip install -r requirements.txt
# Copy the rest of the build context (filtered by .dockerignore).
COPY . .
# Arguments given to `docker run <image> ...` are appended to this command,
# i.e. they become sys.argv[1:] of main.py.
ENTRYPOINT ["python", "main.py"]

238
main.py Normal file
View File

@@ -0,0 +1,238 @@
#!/usr/bin/env python3
import os
import sys
import json
import subprocess
import numpy as np
import soundfile as sf
import torchaudio
import torch
import whisper
from deep_translator import GoogleTranslator
from pyannote.audio import Pipeline
from rich.console import Console
from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, TimeElapsedColumn
# Shared Rich console used for all user-facing output of the script.
console = Console()
# --- Allow-list TorchVersion as a safe global for torch deserialization (checkpoint workaround) ---
# NOTE(review): presumably needed because one of the model checkpoints loaded
# later contains a pickled TorchVersion object — confirm which one.
from torch.serialization import add_safe_globals
from torch.torch_version import TorchVersion
add_safe_globals([TorchVersion])
# --- Validate CLI arguments and environment (a Hugging Face token is mandatory) ---
# Exactly one positional argument is expected: the path of the video to dub.
if len(sys.argv) < 2:
    # Build the usage hint from argv[0] so it names the script that was
    # actually started instead of a hard-coded (and outdated) file name.
    script_name = os.path.basename(sys.argv[0]) or "main.py"
    console.print(f"❌ Használat: python3 {script_name} <video.mp4>")
    sys.exit(1)
video_path = sys.argv[1]
# isfile() rather than exists(): a directory path is not a usable video and
# would otherwise only fail later, inside ffmpeg, with no clear message.
if not os.path.isfile(video_path):
    console.print("❌ A videó nem található!")
    sys.exit(1)
# The token is passed to the pyannote pipeline loader further down.
token = os.environ.get("HF_TOKEN")
if not token:
    console.print("❌ HF_TOKEN NINCS beállítva!")
    sys.exit(1)
# Intermediate file that holds the audio track extracted from the video.
AUDIO_WAV = "audio.wav"
# --- 1. Extract the audio track with FFmpeg into a mono 16 kHz WAV file ---
console.print("\n🎬 [1/6] Hang kinyerés videóból (mono, 16kHz WAV)…")
# Remove a leftover file from a previous run so it can never be mistaken for
# the result of this run.
if os.path.exists(AUDIO_WAV):
    os.remove(AUDIO_WAV)
with Progress(
    TextColumn("{task.description}"),
    BarColumn(),
    TimeElapsedColumn(),
    TimeRemainingColumn(),
) as progress:
    # total=None: the duration of the ffmpeg run is unknown in advance.
    progress.add_task("FFmpeg extract fut…", total=None)
    try:
        extract_run = subprocess.run(
            [
                "ffmpeg", "-y", "-i", video_path,
                "-map", "0:a:0", "-ac", "1", "-ar", "16000",
                "-c:a", "pcm_s16le", AUDIO_WAV,
            ],
            stdout=subprocess.DEVNULL,
            # Keep stderr so a failure can be explained to the user.
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
        )
    except OSError as exc:
        # Raised when the ffmpeg executable cannot be started at all.
        extract_failed = True
        extract_log = str(exc)
    else:
        extract_failed = extract_run.returncode != 0
        extract_log = extract_run.stderr or ""
# A non-zero exit code counts as failure even if a (possibly truncated)
# output file was left behind.
if extract_failed or not os.path.exists(AUDIO_WAV):
    console.print("❌ Hang extract sikertelen!")
    extract_tail = "\n".join(extract_log.strip().splitlines()[-15:])
    if extract_tail:
        # ffmpeg output contains square brackets; print it verbatim rather
        # than letting Rich interpret it as markup.
        console.print(extract_tail, markup=False, highlight=False)
    sys.exit(1)
console.print("✔ Hang kinyerve: audio.wav\n")
# --- 2. Whisper STT (English speech recognition) ---
console.print("🎤 [2/6] Speech-to-Text (Whisper, EN felismerés)…")
# Original author's note on the "large" checkpoint: stable, multilingual.
model = whisper.load_model("large")
stt_columns = (
    TextColumn("Whisper STT…"),
    BarColumn(),
    TimeElapsedColumn(),
    TimeRemainingColumn(),
)
with Progress(*stt_columns) as stt_progress:
    # Indeterminate task: it only signals that transcription is running.
    stt_progress.add_task("STT fut…", total=None)
    result = model.transcribe(AUDIO_WAV, language="en")
# Fall back to an empty list when the result carries no "segments" key.
segments = result.get("segments", [])
console.print(f"✔ STT kész, szegmensek száma: {len(segments)}")
# --- 2.5 Translate English → Hungarian ---
console.print("\n🌍 Angol → Magyar fordítás…")
translator = GoogleTranslator(source="en", target="hu")


def _translate_segment(stt_segment):
    """Return the translated record for one STT segment.

    Empty (whitespace-only) text is not sent to the translator and yields
    an empty Hungarian string.
    """
    english = stt_segment["text"].strip()
    hungarian = translator.translate(english) if english else ""
    return {
        # Placeholder; the real speaker label is assigned in the alignment step.
        "speaker": "UNKNOWN",
        "start": stt_segment["start"],
        "end": stt_segment["end"],
        "en": english,
        "hu": hungarian,
    }


translated = [_translate_segment(stt_segment) for stt_segment in segments]
with open("segments_translated.json", "w", encoding="utf-8") as translated_file:
    json.dump(translated, translated_file, ensure_ascii=False, indent=2)
console.print("✔ Fordított szegmensek mentve: segments_translated.json\n")
# --- 3. Speaker recognition (pyannote diarization) ---
console.print("👥 [3/6] Beszélők diarization (pyannote/speaker-diarization-3.1)…")
# The pipeline is fed the already loaded waveform instead of a file path.
waveform, sample_rate = torchaudio.load(AUDIO_WAV)
audio_dict = {"waveform": waveform, "sample_rate": sample_rate}
diar_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", token=token)
diar_columns = (
    TextColumn("Diarization fut…"),
    BarColumn(),
    TimeElapsedColumn(),
    TimeRemainingColumn(),
)
with Progress(*diar_columns) as diar_progress:
    diar_progress.add_task("Beszélők szétválasztása…", total=None)
    diarization = diar_pipeline(audio_dict)
# Flatten the diarization tracks into plain dicts (JSON-serializable floats).
speaker_segments = [
    {"speaker": label, "start": float(turn.start), "end": float(turn.end)}
    for turn, _, label in diarization.itertracks(yield_label=True)
]
# Distinct speaker labels, used only for the summary line below.
speakers = {entry["speaker"] for entry in speaker_segments}
with open("diar.json", "w", encoding="utf-8") as diar_file:
    json.dump(speaker_segments, diar_file, ensure_ascii=False, indent=2)
console.print(f"✔ Diarization kész, beszélők: {sorted(speakers)}\n")
# --- 4. Assign a speaker to every translated segment based on time ---
console.print("🧠 [4/6] Speaker ID STT → speaker idő alapján…")


def find_speaker_for_time(t, end=None):
    """Return the diarization speaker label for a moment or a time span.

    Args:
        t: Start time in seconds.
        end: Optional end time in seconds. When given (and later than ``t``),
            the speaker whose diarization turns overlap ``[t, end]`` the
            longest is returned. This avoids mislabeling a segment whose
            start instant falls into a gap between turns or exactly on a
            turn boundary.

    Returns:
        The speaker label, or ``"UNKNOWN"`` when no diarization turn matches.
        Without ``end`` the first turn that contains ``t`` wins.
    """
    if end is not None and end > t:
        overlap_by_speaker = {}
        for s in speaker_segments:
            overlap = min(end, s["end"]) - max(t, s["start"])
            if overlap > 0:
                overlap_by_speaker[s["speaker"]] = (
                    overlap_by_speaker.get(s["speaker"], 0.0) + overlap
                )
        if overlap_by_speaker:
            # On a tie the speaker encountered first in speaker_segments wins.
            return max(overlap_by_speaker, key=overlap_by_speaker.get)
    # Point lookup: used when no span is given or nothing overlaps the span.
    for s in speaker_segments:
        if s["start"] <= t <= s["end"]:
            return s["speaker"]
    return "UNKNOWN"


# Same records as `translated`, with the placeholder speaker replaced.
aligned = [
    {
        "speaker": find_speaker_for_time(item["start"], item["end"]),
        "start": item["start"],
        "end": item["end"],
        "en": item["en"],
        "hu": item["hu"],
    }
    for item in translated
]
with open("aligned_segments.json", "w", encoding="utf-8") as f:
    json.dump(aligned, f, ensure_ascii=False, indent=2)
console.print("✔ Speakerhez rendelt szegmensek mentve: aligned_segments.json\n")
# --- 5. TTS: a separate Hungarian voice per speaker (edge-tts) ---
console.print("🎙 [5/6] Magyar hang generálás speakerenként külön voice-al…")
available_voices = [
    "hu-HU-NoemiNeural",
    "hu-HU-TamasNeural",
    "hu-HU-LillaNeural",
    "hu-HU-ImreNeural",
]
# Voices are handed out round-robin over the sorted speaker labels, so more
# speakers than voices simply reuse voices.
spk_list = sorted({s["speaker"] for s in aligned})
voice_map = {
    spk: available_voices[i % len(available_voices)]
    for i, spk in enumerate(spk_list)
}
console.print("🗣 Speaker → Voice kiosztás:")
for k, v in voice_map.items():
    # Separator added: speaker and voice used to be printed run together.
    console.print(f" {k} → {v}")
os.makedirs("tts_segments", exist_ok=True)
tts_meta = []
tts_failed = 0
for i, seg in enumerate(aligned):
    spk = seg["speaker"]
    text = seg["hu"].strip()
    if not text:
        # Nothing to synthesize for segments without translated text.
        continue
    voice = voice_map.get(spk, available_voices[0])
    out_path = os.path.join("tts_segments", f"tts_{i:04d}.wav")
    # Drop a clip left over from an earlier run; otherwise a failed synthesis
    # would silently reuse stale audio that belongs to a different text.
    if os.path.exists(out_path):
        os.remove(out_path)
    # NOTE(review): edge-tts presumably writes MP3 data even though the file is
    # named .wav — confirm the reader in step 6 detects the format by content.
    tts_run = subprocess.run([
        "edge-tts", "--voice", voice,
        # "--text=<value>" keeps text that starts with "-" from being parsed
        # as a command-line option.
        f"--text={text}",
        "--write-media", out_path,
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    if tts_run.returncode == 0 and os.path.exists(out_path):
        tts_meta.append({"path": out_path, "start": seg["start"]})
    else:
        tts_failed += 1
if tts_failed:
    # Make dropped segments visible instead of losing them silently.
    console.print(f"⚠ {tts_failed} db TTS szegmens sikertelen")
console.print(f"\n{len(tts_meta)} db TTS hang-szegmens elkészült\n")
# --- 6. Build the Hungarian TTS track and mux it back into the video ---
console.print("🎚 [6/6] Magyar dub hangsáv létrehozása és visszaillesztése videóra…")
if not tts_meta:
    console.print("❌ Nincs TTS, kilépek.")
    sys.exit(1)
# Size the dub track by the real length of the extracted audio, not just by
# the end of the last recognized segment: the mux below uses "-shortest", so
# a too-short dub track would cut the video off after the last speech.
video_duration = max(aligned[-1]["end"], sf.info(AUDIO_WAV).duration)
target_sr = 48000
# One extra second of headroom at the end of the track.
samples = int((video_duration + 1) * target_sr)
mix = np.zeros(samples, dtype=np.float32)
# One Resample transform per source sample rate, reused for all clips.
resamplers = {}
for tts in tts_meta:
    data_tts, sr_tts = sf.read(tts["path"])
    if data_tts.ndim > 1:
        # Down-mix multi-channel clips; the dub track itself is mono.
        data_tts = data_tts.mean(axis=1)
    if sr_tts != target_sr:
        res = resamplers.get(sr_tts)
        if res is None:
            res = resamplers[sr_tts] = torchaudio.transforms.Resample(sr_tts, target_sr)
        data_tts = res(torch.from_numpy(data_tts).float().unsqueeze(0)).squeeze(0).numpy()
    start_i = int(tts["start"] * target_sr)
    if start_i >= len(mix):
        # Clip would start beyond the end of the track; nothing to place.
        continue
    # Clips running past the end of the track are truncated.
    end_i = min(start_i + len(data_tts), len(mix))
    # Overlapping clips are summed.
    mix[start_i:end_i] += data_tts[:end_i - start_i]
# Peak-normalize to 0.9; the epsilon guards against division by zero.
mix = mix / (np.max(np.abs(mix)) + 1e-9) * 0.9
sf.write("dub_mix.wav", mix, target_sr)
console.print("✔ Dub sáv mentve: dub_mix.wav")
try:
    mux_run = subprocess.run(
        [
            "ffmpeg", "-y", "-i", video_path,
            "-i", "dub_mix.wav",
            # Video from the original file, audio from the dub track.
            "-map", "0:v:0", "-map", "1:a:0",
            "-c:v", "copy", "-c:a", "aac", "-shortest",
            "dubbed_output.mp4",
        ],
        stdout=subprocess.DEVNULL,
        # Keep stderr so a failure can be explained to the user.
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )
except OSError as exc:
    # Raised when the ffmpeg executable cannot be started at all.
    mux_failed = True
    mux_log = str(exc)
else:
    mux_failed = mux_run.returncode != 0
    mux_log = mux_run.stderr or ""
if mux_failed or not os.path.exists("dubbed_output.mp4"):
    # Do not report success when ffmpeg failed (a stale output file from an
    # earlier run could otherwise be mistaken for the new result).
    console.print("❌ Videó muxolás sikertelen!")
    mux_tail = "\n".join(mux_log.strip().splitlines()[-15:])
    if mux_tail:
        # Print ffmpeg output verbatim; its brackets are not Rich markup.
        console.print(mux_tail, markup=False, highlight=False)
    sys.exit(1)
console.print("\n✅ Magyar AI dub kész: dubbed_output.mp4")

9
requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
# Python dependencies of main.py.
# NOTE(review): no versions are pinned, so builds are not reproducible and an
# upstream API change can break the pipeline — consider pinning.
openai-whisper
pyannote.audio
torch
torchaudio
numpy
soundfile
rich
deep-translator
# Used through its command-line tool (`edge-tts`), not imported in main.py.
edge-tts