使用 Vegasaur 導出 Vegas Pro 視頻項目中的字幕

我想要將 Vegas Pro 中的字幕和文字導出並生成 Youtube 的 Chapter 章節。一開始我嘗試了 Tools 裡的 Scripting，但是並沒有成功。後來找到 Vegasaur Toolkit 這個工具，它有 30 天免費試用，可以將軌道上的文字導出成 srt 字幕文件。然後我使用 ffmpeg 將 srt 轉換成 vtt 字幕文件，最後再通過 Python 腳本將其轉換成 Youtube 支持的章節文本文檔。

下載 Vegasaur Toolkit，並安裝。我的 Vegas Pro 版本是 18，我安裝的 Vegasaur Toolkit 版本是 3.9.5。安裝完，打開 .veg 文件，鼠標選中要導出的字幕軌道，然後點擊 View > Extensions > Vegasaur > Timeline > Text Generation Wizard。在彈出的窗口中選擇 Export text events，然後點 Next。然後選擇 Selected Tracks 和 Save to File 並設置要導出的文件名和文件格式。最後點擊 Finish 就導出了。

剩下的步驟我使用了腳本來輔助完成。我有兩條字幕軌道，章節文本基本上都位於第一條軌道，但是偶爾第二條軌道也會有，所以我還需要合併一下兩個軌道導出的字幕文件：

#vtt utils
import subprocess
import sys
from datetime import timedelta

#usage:

#srt to youtube chapters
#python main.py srt2yt 1.srt 2.srt

def _ensure_hours(timestamp):
    """Return *timestamp* with a leading ``00:`` hours field if it lacks one."""
    # A full VTT timestamp has two colons (HH:MM:SS.mmm); the short form has one.
    return timestamp if timestamp.count(':') >= 2 else '00:' + timestamp


def add_style_to_vtt(input_file, output_file, style):
    """Copy a VTT file, appending cue settings to every cue timing line.

    Each line containing ``-->`` is rewritten as
    ``<start> --> <end> [<existing settings>] <style>``; start and end are
    expanded to the ``HH:MM:SS.mmm`` form when the hours field is missing.
    All other lines are copied unchanged.

    Args:
        input_file: Path of the VTT file to read (UTF-8).
        output_file: Path of the VTT file to write (UTF-8).
        style: Cue settings text to append, e.g.
            ``'position:100% align:right size:50%'``.
    """
    # Explicit UTF-8 so non-ASCII cue text does not depend on the locale
    # default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(output_file, 'w', encoding='utf-8') as f:
        for line in lines:
            if '-->' in line:
                start, _, rest = line.strip().partition('-->')
                start = start.strip()
                # First field after the arrow is the end time; anything
                # following it is cue settings already present on the line.
                fields = rest.split(None, 1)
                # A line that merely contains '-->' but is not a usable
                # timing line is passed through untouched.
                if start and fields:
                    end = _ensure_hours(fields[0])
                    existing = fields[1] + ' ' if len(fields) > 1 else ''
                    line = f'{_ensure_hours(start)} --> {end} {existing}{style}\n'
            f.write(line)

def combine_two_vtt(input_file_1, input_file_2, output_file):
    """Merge the cues of two VTT files into one file ordered by start time.

    Args:
        input_file_1: Path of the first VTT file.
        input_file_2: Path of the second VTT file.
        output_file: Path of the merged VTT file to write.
    """
    merged = []
    # Read both inputs in argument order; write_sorted_vtt does the ordering.
    for source in (input_file_1, input_file_2):
        merged.extend(parse_vtt_file(source))
    write_sorted_vtt(merged, output_file)

def time_to_seconds(time_str):
    """Convert an ``HH:MM:SS`` or ``MM:SS`` string to a number of seconds.

    Every field may carry a fractional part (e.g. ``'01:02:03.5'``).

    Args:
        time_str: Colon-separated time string with two or three fields.

    Returns:
        The total number of seconds as a float.

    Raises:
        ValueError: If the string does not have two or three fields, or a
            field is not a number.
    """
    parts = time_str.split(':')
    # Dispatch on the field count instead of catching every exception, so
    # unrelated errors are not swallowed.
    if len(parts) == 3:
        hours, minutes, seconds = map(float, parts)
    elif len(parts) == 2:
        hours = 0.0
        minutes, seconds = map(float, parts)
    else:
        raise ValueError(f"Unexpected time format: {time_str}")
    return hours * 3600 + minutes * 60 + seconds

def seconds_to_time(seconds):
    """Format a number of seconds as ``HH:MM:SS.mmm``.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        The zero-padded time string with millisecond precision.
    """
    # Round to whole milliseconds before splitting into fields; rounding
    # only inside the format spec can yield a seconds field of "60.000"
    # (e.g. for 59.9996) instead of carrying into the minutes.
    total_ms = round(seconds * 1000)
    total_secs, millis = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_mins, 60)
    return f'{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}'

def youtube_chapters(input_file, output_file):
    """Write one ``<time> <title>`` chapter line per cue of a VTT file.

    The chapter time is the cue's start time with the fractional part
    dropped. From the second chapter on, a chapter that would start less
    than 10 seconds after the previous one is moved to 10 seconds after it.
    The title is the cue's text lines joined with spaces; when a cue has
    exactly three text lines the second one is left out.

    Args:
        input_file: Path of the VTT file to read (UTF-8).
        output_file: Path of the chapter text file to write (UTF-8).
    """
    # Explicit UTF-8 so non-ASCII titles do not depend on the locale default.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(output_file, 'w', encoding='utf-8') as f:
        # None until the first chapter is written: the minimum gap only
        # applies between chapters, so a first cue at 00:00 stays at 00:00.
        last_seconds = None
        i = 0
        while i < len(lines):
            if '-->' in lines[i]:
                # Start timestamp is the text before the arrow; drop the
                # fractional seconds.
                start_field = lines[i].split('-->')[0].strip()
                current_time = start_field.partition('.')[0]
                current_seconds = time_to_seconds(current_time)
                if last_seconds is not None and current_seconds - last_seconds < 10:
                    current_seconds = last_seconds + 10
                    # Drop the fractional part so shifted and unshifted
                    # times are written in the same whole-second form.
                    current_time = seconds_to_time(current_seconds).partition('.')[0]
                last_seconds = current_seconds

                caption_list = [current_time]
                # Collect the cue text up to the next blank (or
                # whitespace-only) line.
                while i < len(lines) - 1 and lines[i + 1].strip():
                    caption_list.append(lines[i + 1].strip())
                    i += 1

                if len(caption_list) == 4:
                    # Time plus three text lines: leave out the middle text line.
                    caption_list.pop(2)
                f.write(' '.join(caption_list) + '\n')
            i += 1

def parse_vtt_timestamp(ts):
    """Parse a VTT timestamp into a ``timedelta``.

    Accepts ``HH:MM:SS.mmm`` and ``MM:SS.mmm``. The fractional part may be
    missing, and an SRT-style comma is accepted as the decimal separator.
    Only the first whitespace-separated field is parsed, so cue settings
    following an end timestamp are ignored.

    Args:
        ts: Timestamp text.

    Returns:
        The offset as a ``timedelta``.

    Raises:
        ValueError: If the text is empty, does not have two or three
            colon-separated fields, or a field is not an integer.
    """
    fields = ts.split()
    if not fields:
        raise ValueError(f"Unexpected timestamp format: {ts}")
    parts = fields[0].replace(",", ".").split(":")
    if len(parts) == 3:
        h, m, s = parts
    elif len(parts) == 2:
        h = 0
        m, s = parts
    else:
        raise ValueError(f"Unexpected timestamp format: {ts}")
    s, _, fraction = s.partition(".")
    # Read the fraction as a decimal fraction of a second: pad or truncate
    # to three digits so ".5" means 500 ms rather than 5 ms.
    ms = int(fraction.ljust(3, "0")[:3]) if fraction else 0
    return timedelta(hours=int(h), minutes=int(m), seconds=int(s), milliseconds=ms)

def parse_vtt_file(file_path):
    """Read a VTT file and return its cues.

    Args:
        file_path: Path of the VTT file to read (UTF-8).

    Returns:
        A list of ``(start, timing_line, text_lines)`` tuples in file order:
        the parsed start time, the stripped timing line, and the cue's text
        lines without their trailing newline.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = handle.readlines()

    cues = []
    total = len(raw)
    pos = 0
    while pos < total:
        header = raw[pos].strip()
        pos += 1
        # Anything that is not a timing line (header, blank lines) is skipped.
        if "-->" not in header:
            continue
        begins_at = parse_vtt_timestamp(header.split(" --> ")[0])
        # The cue text runs until the next blank line or the end of the file.
        body = []
        while pos < total and raw[pos].strip() != "":
            body.append(raw[pos].rstrip('\n'))
            pos += 1
        cues.append((begins_at, header, body))
    return cues

def write_sorted_vtt(entries, output_path):
    """Sort cues by start time and write them out as a VTT file.

    Args:
        entries: List of ``(start, timing_line, text_lines)`` tuples. The
            list is sorted in place.
        output_path: Path of the VTT file to write (UTF-8).
    """
    entries.sort(key=lambda entry: entry[0])
    with open(output_path, 'w', encoding='utf-8') as out:
        out.write("WEBVTT\n\n")
        for entry in entries:
            _, cue_header, cue_text = entry
            # One cue: timing line, its text lines, then a blank separator.
            pieces = [f"{cue_header}\n"]
            pieces.extend(f"{row}\n" for row in cue_text)
            pieces.append("\n")
            out.write("".join(pieces))

def format_youtube_time(td):
    """Format a ``timedelta`` as a chapter timestamp.

    Fractional seconds are truncated. The result is ``MM:SS`` below one
    hour and ``HH:MM:SS`` from one hour on.

    Args:
        td: Offset as a ``timedelta``.

    Returns:
        The formatted timestamp string.
    """
    whole_seconds = int(td.total_seconds())
    whole_minutes, secs = divmod(whole_seconds, 60)
    hrs, mins = divmod(whole_minutes, 60)
    if hrs <= 0:
        return f"{mins:02}:{secs:02}"
    return f"{hrs:02}:{mins:02}:{secs:02}"

def parse_vtt_for_chapters(file_path):
    """Extract de-duplicated chapters from a VTT file.

    Only cues with at least two text lines are used: the first line is
    taken as the Chinese name and the last line as the English name. When
    several cues share a Chinese name, the one with the longest duration
    is kept.

    Args:
        file_path: Path of the VTT file to read (UTF-8).

    Returns:
        A list of ``(start, chinese_name, english_name)`` tuples sorted by
        start time.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = handle.readlines()

    # Chinese name -> (english_name, duration, start) of the longest cue so far.
    longest = {}
    total = len(raw)
    pos = 0
    while pos < total:
        header = raw[pos].strip()
        pos += 1
        if "-->" not in header:
            continue

        start_str, end_str = header.split(" --> ")
        begin = parse_vtt_timestamp(start_str.strip())
        finish = parse_vtt_timestamp(end_str.strip())
        length = finish - begin

        # Cue text runs until the next blank line or the end of the file.
        texts = []
        while pos < total and raw[pos].strip():
            texts.append(raw[pos].strip())
            pos += 1

        # Cues with fewer than two text lines carry no name pair; skip them.
        if len(texts) < 2:
            continue

        title_cn, title_en = texts[0], texts[-1]
        known = longest.get(title_cn)
        if known is None or length > known[1]:
            longest[title_cn] = (title_en, length, begin)

    chapters = [
        (begin, title_cn, title_en)
        for title_cn, (title_en, _length, begin) in longest.items()
    ]
    chapters.sort(key=lambda chapter: chapter[0])
    return chapters

def write_youtube_chapters(chapters, output_path):
    """Write chapters to a text file, one ``<time> <name> <name>`` per line.

    Args:
        chapters: Iterable of ``(start, chinese_name, english_name)`` tuples.
        output_path: Path of the text file to write (UTF-8).
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        for begin, title_cn, title_en in chapters:
            stamp = format_youtube_time(begin)
            out.write(f"{stamp} {title_cn} {title_en}\n")

if __name__ == "__main__":
    # Command-line entry point: the first argument selects the operation.
    import os  # only the entry point needs it

    usage = (
        "usage:\n"
        "  python main.py combine A.vtt B.vtt   # merge two VTT files\n"
        "  python main.py style IN.vtt STYLE    # append cue settings\n"
        "  python main.py youtube IN.vtt        # VTT -> YouTube chapters\n"
        "  python main.py srt2yt A.srt B.srt    # two SRT files -> chapters\n"
    )
    # Number of arguments each command needs after its own name.
    required_args = {'combine': 2, 'style': 2, 'youtube': 1, 'srt2yt': 2}

    def _stem(path):
        """Return *path* without its extension, whatever its length."""
        return os.path.splitext(path)[0]

    args = sys.argv[1:]
    func = args[0] if args else None
    # Reject unknown or incomplete commands up front instead of failing
    # with an IndexError part-way through or silently doing nothing.
    if func not in required_args or len(args) - 1 < required_args[func]:
        print(usage, file=sys.stderr)
        sys.exit(1)

    if func == 'combine':
        input_file_1 = args[1]
        input_file_2 = args[2]
        output_file = _stem(input_file_1) + '.combined.vtt'
        combine_two_vtt(input_file_1, input_file_2, output_file)
    elif func == 'style':
        input_file = args[1]
        output_file = _stem(input_file) + '.style.vtt'
        # Example styles:
        #   'position:100% align:right size:50%'
        #   'position:0% align:left size:50%'
        style = args[2]
        add_style_to_vtt(input_file, output_file, style)
    elif func == 'youtube':
        input_file = args[1]
        output_file = _stem(input_file) + '.youtube.txt'
        chapters = parse_vtt_for_chapters(input_file)
        write_youtube_chapters(chapters, output_file)
    elif func == 'srt2yt':
        input_file_1 = args[1]
        input_file_2 = args[2]
        vtt1 = _stem(input_file_1) + '.vtt'
        vtt2 = _stem(input_file_2) + '.vtt'
        # ffmpeg's exit status is deliberately not checked, so a failed or
        # declined conversion does not abort the run.
        subprocess.run(['ffmpeg', '-i', input_file_1, vtt1])
        subprocess.run(['ffmpeg', '-i', input_file_2, vtt2])
        # Pause so the user can review the ffmpeg output before continuing.
        input("Press enter to continue...")
        combined_file = _stem(input_file_1) + '.combined.vtt'
        combine_two_vtt(vtt1, vtt2, combined_file)
        yt_file = _stem(input_file_1) + '.youtube.txt'
        chapters = parse_vtt_for_chapters(combined_file)
        write_youtube_chapters(chapters, yt_file)

腳本中還有一些 Youtube 支持的 vtt 格式字幕樣式的嘗試。

發佈留言 取消回覆

發佈留言取消回覆