[解包] 常轨脱离(ハミダシ)语音与文本解包制作GPTSoVits数据集

1214 字

6 分钟

[解包] 常轨脱离(ハミダシ)语音与文本解包制作GPTSoVits数据集

2025-08-29

数据集制作

数据集

/

解包

/

TTS

/

GPTSoVits

/

Lua

需要准备什么#

游戏本体
Lua环境（下文脚本依赖LuaFileSystem，即lfs模块，以及dkjson.lua），提取文本用
~~Node.js，写脚本用~~ 既然用Lua了就直接全用Lua写算了
GARbro，解包用
FFmpeg，.ogg转.wav，记得添加到环境变量

解包语音和脚本资源#

运行GARbro.GUI.exe，点击左上角文件->打开，选择游戏目录内的hamidashi.pfs

将script目录内所有内容（扩展名.ast的文件）导出备用，这个目录内即为脚本文件

之后打开游戏目录内的hamidashi.pfs.000，进入sound/vo/下以角色名简写命名的目录并导出其中的音频。比如我要导出錦あすみ的语音，就导出sound/vo/asu内的所有.ogg音频

（不过如果你在找妃愛的语音的话，还有一部分在hamidashi.pfs.001的sound/vo/hiy）

将所有需要的.ogg音频导出到一个文件夹内备用，暂且称之voice_ogg目录吧

提取语音对应的文本#

不同于很多使用经典的.json的其他galgame，解析这个~~神秘的~~.ast的过程相当波折

一段例子

1
text={
2
    [1]={
3
        vo={
4
            {"vo",file="fem_mir_10142",ch="mir"},
5
        },
6
        name={name="里"},
7
        ja={
8
            {
9
            "「今日はほんとお疲れー！　最後までみんな笑いすぎでウケる！　うち、しおぽよがあれだけ楽しそうにしてるの初めて見たし！」",
10
            },
11
        },
12
    },
13
...
14
}

一开始看到这神奇的语法还以为是什么私有的标记语言，然后跟AI对线一小时整了个正则表达式

后来注意到怎么从1开始呢，不会跟lua有关吧，查了一下才知道原来这整个文件就是个lua脚本（~~电脑里有一款索引从1开始的语言~~）

2025-11-25: 才知道原来这个引擎就是基于Lua脚本的，早知道应该先调查一下的

提取文本与对应音频文件名生成json的lua脚本#

将该脚本与dkjson.lua放置在与script目录同级

-- Collects every voiced line spoken by one character from the game's .ast
-- scripts and writes the (voice file name, text) pairs to text_output.json.
-- Place this script and dkjson.lua next to the "script" directory and run it
-- from there.
local json = require("dkjson")
local lfs = require("lfs")

local script_dir = "script"
local output_path = "text_output.json"

-- Speaker name matched against `name.name`; it can be looked up in the .ast files.
local target_name = "妃愛"
-- For entries such as name={name="あすみ",ja="雪景シキ"}, set this to
-- "雪景シキ" to match on `name.ja` instead. Leave it nil otherwise; when it is
-- non-nil it takes precedence over target_name.
local ja_name = nil

-- Quotation brackets removed from the extracted text.
local brackets = { "「", "」", "『", "』" }

--- Executes one .ast file and returns the global `ast` table it defines.
-- The global is cleared before loading so that a file which does not define
-- `ast` is never mistaken for the previously loaded one, and the file runs
-- under pcall so a single broken script does not abort the whole extraction.
-- @param path path of the .ast file
-- @return the `ast` value (may be nil), plus an error message when loading failed
local function load_ast(path)
    _G.ast = nil
    local ok, err = pcall(dofile, path)
    local loaded = _G.ast
    _G.ast = nil
    if not ok then
        return nil, tostring(err)
    end
    return loaded
end

--- Tells whether a text entry is spoken by the configured character.
-- @param entry one element of `ast.text`
-- @return boolean
local function is_target_speaker(entry)
    local name = entry.name
    if type(name) ~= "table" then
        return false
    end
    if ja_name then
        return name.ja == ja_name
    end
    return name.name ~= nil and name.name == target_name
end

--- Returns the voice file name of an entry (`vo[1].file`), or nil if absent.
local function voice_file_of(entry)
    local first = type(entry.vo) == "table" and entry.vo[1]
    local file = type(first) == "table" and first.file
    if type(file) == "string" then
        return file
    end
    return nil
end

--- Returns the Japanese text of an entry (`ja[1][1]`) with quotation brackets
-- stripped, or nil when that slot is missing or is not a plain string.
local function text_of(entry)
    local first = type(entry.ja) == "table" and entry.ja[1]
    local line = type(first) == "table" and first[1]
    if type(line) ~= "string" then
        return nil
    end
    for _, bracket in ipairs(brackets) do
        -- gsub also returns a match count; the single assignment drops it.
        line = line:gsub(bracket, "")
    end
    return line
end

local all_data = {}
local skipped = 0

for file in lfs.dir(script_dir) do
    if file:match("%.ast$") then
        local path = script_dir .. "/" .. file
        local ast_data, load_err = load_ast(path)
        if type(ast_data) == "table" and type(ast_data.text) == "table" then
            for _, entry in pairs(ast_data.text) do
                -- Only voiced lines of the target speaker are of interest.
                if type(entry) == "table" and entry.vo and is_target_speaker(entry) then
                    local voice_file = voice_file_of(entry)
                    local text = text_of(entry)
                    if voice_file and text then
                        all_data[#all_data + 1] = {
                            voice_file = voice_file,
                            text = text
                        }
                    else
                        -- Malformed entry: the later scripts need both fields.
                        skipped = skipped + 1
                    end
                end
            end
        else
            print("无法加载文件：" .. path)
            if load_err then
                print(load_err)
            end
        end
    end
end

if skipped > 0 then
    print("skipped entries without usable voice file or text: " .. skipped)
end

-- Export as JSON
local json_str = json.encode(all_data, {
    indent = true
})
local out, open_err = io.open(output_path, "w")
if not out then
    error("无法写入 JSON 文件: " .. output_path .. " (" .. tostring(open_err) .. ")")
end
out:write(json_str)
out:close()

运行该脚本即可得到text_output.json

音频格式转换与筛选#

将该脚本与dkjson.lua放置在与script目录同级的位置（voice_ogg目录和上一步生成的text_output.json也需要在这一级）

IMPORTANT
需要将FFmpeg添加到环境变量，或者自己改脚本里的调用命令吧

-- Converts the exported .ogg voice files listed in text_output.json to .wav
-- with FFmpeg, keeping only clips whose duration lies within the configured
-- range. ffmpeg and ffprobe must be reachable through PATH.
local lfs = require("lfs")
local json = require("dkjson")

local input_dir = "voice_ogg"
local output_dir = "voice_wav"
local json_path = "text_output.json"

local min_duration = 3 -- shortest clip to keep, in seconds
local max_duration = 30 -- longest clip to keep, in seconds

-- Create the output directory through lfs rather than a shell `mkdir`.
local attr = lfs.attributes(output_dir)
if not attr or attr.mode ~= "directory" then
    local ok, mkdir_err = lfs.mkdir(output_dir)
    if not ok then
        error("无法创建输出目录: " .. output_dir .. " (" .. tostring(mkdir_err) .. ")")
    end
end

-- Read the JSON produced by the extraction script
local f = io.open(json_path, "r")
if not f then
    error("无法打开 JSON 文件: " .. json_path)
end
local content = f:read("*a")
f:close()

local data, _, err = json.decode(content)
if err or type(data) ~= "table" then
    error("JSON 解析失败: " .. tostring(err))
end

-- Set of voice file base names that have a matching text entry.
local allowed = {}
for _, entry in ipairs(data) do
    if type(entry) == "table" and entry.voice_file ~= nil then
        allowed[entry.voice_file] = true
    end
end

--- Returns the duration of an audio file in seconds as reported by ffprobe.
-- @param file_path path of the audio file
-- @return number, or nil when ffprobe could not be started or printed no number
local function get_duration(file_path)
    local cmd = string.format(
        'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "%s"', file_path)
    local handle = io.popen(cmd)
    if not handle then
        return nil
    end
    local result = handle:read("*a")
    handle:close()
    return tonumber(result)
end

for file in lfs.dir(input_dir) do
    local base_name = file:match("^(.*)%.ogg$")
    if base_name and allowed[base_name] then
        local input_path = input_dir .. "/" .. file
        local output_path = output_dir .. "/" .. base_name .. ".wav"
        local duration = get_duration(input_path)

        if duration and duration >= min_duration and duration <= max_duration then
            -- -y overwrites an existing output so a re-run does not stop at
            -- ffmpeg's interactive "Overwrite?" prompt.
            local cmd = string.format('ffmpeg -y -i "%s" "%s"', input_path, output_path)
            local status = os.execute(cmd)
            -- os.execute returns 0 on success in Lua 5.1 and true in 5.2+.
            if status ~= true and status ~= 0 then
                print("ffmpeg failed: " .. input_path)
            end
        end
    end
end

运行脚本后我们便有了所有需要的资源，可以拿着它们炼TTS啦

生成GPTSoVits数据集使用的`slicer.list`#

同样放置在同级目录

-- Writes the slicer.list annotation file used for GPT-SoVITS training: one
-- line per converted .wav file, in the form
--   <absolute wav path>|slicer|JA|<text>
-- Run it from the directory that contains voice_wav and text_output.json.
local lfs = require("lfs")
local json = require("dkjson")

local current_dir = assert(lfs.currentdir())

local wav_dir = "voice_wav"
local json_path = "text_output.json"
local output_slicer = "slicer.list"

-- Directory separator of the running platform ("\\" on Windows, "/" elsewhere).
local sep = package.config:sub(1, 1)

-- Read the JSON produced by the extraction script
local f = io.open(json_path, "r")
if not f then
    error("无法打开 JSON 文件: " .. json_path)
end
local content = f:read("*a")
f:close()

local data, _, err = json.decode(content)
if err or type(data) ~= "table" then
    error("JSON 解析失败: " .. tostring(err))
end

-- Map from voice file base name to its text.
local vtpairs = {}
for _, entry in ipairs(data) do
    if type(entry) == "table" and entry.voice_file ~= nil then
        vtpairs[entry.voice_file] = entry.text
    end
end

local output, open_err = io.open(output_slicer, "w")
if not output then
    error("无法写入文件: " .. output_slicer .. " (" .. tostring(open_err) .. ")")
end

for file in lfs.dir(wav_dir) do
    local base_name = file:match("^(.*)%.wav$")
    local text = base_name and vtpairs[base_name]
    if text then
        -- The path must include wav_dir: the .wav files live inside it, not
        -- directly in the current directory.
        local absolute_path = current_dir .. sep .. wav_dir .. sep .. file
        output:write(absolute_path .. "|slicer|JA|" .. text .. "\n")
    end
end

output:close()

之后就可以拿去训练GPTSoVits了

顺便贴一下跟AI对线一小时得到的Regex（没用了）#

1
/{["']vo["'],\s*file=["']([^"']+)["'],\s*ch=["']([^"']+)["']}[\s\S]*?ja=\s*\{\{\s*\{\s*["']([^"']+)["']/g

[解包] 常轨脱离(ハミダシ)语音与文本解包制作GPTSoVits数据集

https://a1kari8.github.io/posts/hamidashi_voice_dataset/

作者

A1kari8

发布于

2025-08-29

许可协议

CC BY-NC-SA 4.0

[工科复变函数] 复数基础

[Fedora 42]自签名自定义内核，启用安全启动

提取文本与对应音频文件名生成json的lua脚本

4

音频格式转换与筛选

5

生成GPTSoVits数据集使用的slicer.list

6

顺便贴一下跟AI对线一小时得到的Regex（没用了）

需要准备什么#

解包语音和脚本资源#

提取语音对应的文本#

提取文本与对应音频文件名生成json的lua脚本#

音频格式转换与筛选#

生成GPTSoVits数据集使用的slicer.list#

顺便贴一下跟AI对线一小时得到的Regex（没用了）#

生成GPTSoVits数据集使用的`slicer.list`#