Install
openclaw skills install audiobook-generator
Generate audiobooks from novels and long-form text with chapter management and character voices. Use when users mention audiobooks, narrating books, or converting lengthy written content to audio.
openclaw skills install audiobook-generator
Convert novels, books, and long-form written content into professional audiobooks with natural narration and character voices.
This skill helps you create audiobooks by:
Use this skill when you need to:
Structure your content like this:
# Chapter 1: The Beginning
[Narrator]
It was a dark and stormy night. The old mansion stood alone on the hill.
[Character: John, Voice: male_0004_a]
"I don't think we should go in there," John whispered nervously.
[Character: Sarah, Voice: female_0006_a]
"Don't be silly. It's just an old house."
[Narrator]
They pushed open the creaky door and stepped inside.
Narrator voice: Choose a clear, neutral voice for narration
Character voices: Assign distinct voices to each character
For long books, process by chapters:
# Example chapter manifest: one dict per chapter, listed in reading order.
# "text" holds the chapter body ("..." is a placeholder in this example).
# NOTE(review): "start_time" is presumably the chapter's offset in the finished
# audio; the unit is not shown here -- confirm before relying on it.
chapters = [
{"title": "Chapter 1", "text": "...", "start_time": 0},
{"title": "Chapter 2", "text": "...", "start_time": 1234},
# ... further chapters follow the same shape
]
Benefits:
Extract structure from the text:
import re
def parse_audiobook_text(text, narrator_voice="male_0004_a"):
    """Split marked-up book text into chapter, narrator and character sections.

    Recognised markers (each on its own line):

    * ``# Chapter ...``                    -> ``{"type": "chapter", "title": ...}``
    * ``[Narrator]``                       -> starts a narrator section
    * ``[Character: NAME, Voice: VOICE]``  -> starts a character section

    Every other non-blank line is appended to the section currently open.
    Text that appears before any marker is attributed to the narrator.

    Args:
        text: The full marked-up text.
        narrator_voice: Voice id stored on narrator sections.

    Returns:
        A list of section dicts in document order. Narrator sections have the
        keys ``type``, ``text`` and ``voice``; character sections additionally
        have ``character``; chapter sections have only ``type`` and ``title``.
        Section text is the section's lines joined by single spaces, with no
        leading or trailing whitespace. Sections with no text are omitted.
    """
    # The name may contain spaces, hyphens or apostrophes ("Mary Jane",
    # "O'Brien"); it ends at the comma that precedes "Voice:".
    character_marker = re.compile(
        r'\[Character:\s*([^,\]]+?)\s*,\s*Voice:\s*(\w+)\s*\]'
    )

    sections = []
    current = {"type": "narrator", "voice": narrator_voice}
    pending_lines = []

    def flush():
        # Emit the open section only if it has something to read aloud, so
        # blank lines between markers never produce whitespace-only sections.
        if pending_lines:
            sections.append({**current, "text": " ".join(pending_lines)})
            pending_lines.clear()

    for raw_line in text.split('\n'):
        # strip() also removes a trailing '\r' from Windows line endings.
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith('# Chapter'):
            flush()
            # Drop the leading "# " to get the chapter title.
            sections.append({"type": "chapter", "title": line[2:]})
            current = {"type": "narrator", "voice": narrator_voice}
        elif line.startswith('[Narrator]'):
            flush()
            current = {"type": "narrator", "voice": narrator_voice}
        elif match := character_marker.match(line):
            flush()
            current = {
                "type": "character",
                "character": match.group(1),
                "voice": match.group(2),
            }
        else:
            pending_lines.append(line)

    flush()
    return sections
For each section, call the TTS API:
import requests
import binascii
def generate_audio_segment(text, voice_id, output_file, speed=1.0, timeout=60):
    """Synthesize ``text`` with the SenseAudio TTS endpoint and save it as MP3.

    Args:
        text: The text to speak.
        voice_id: Voice identifier sent as ``voice_setting.voice_id``.
        output_file: Path the decoded audio bytes are written to.
        speed: Value sent as ``voice_setting.speed``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``extra_info.audio_length`` value from the response (the audio
        duration in milliseconds).

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
        KeyError: The response JSON lacks the expected audio fields.

    Note:
        Relies on a module-level ``API_KEY`` that is defined elsewhere.
    """
    url = "https://api.senseaudio.cn/v1/t2a_v2"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "SenseAudio-TTS-1.0",
        "text": text,
        "stream": False,
        "voice_setting": {
            "voice_id": voice_id,
            "speed": speed,
            "vol": 1.0,
            "pitch": 0,
        },
        "audio_setting": {
            "format": "mp3",
            "sample_rate": 32000,
            "bitrate": 128000,
            "channel": 2,
        },
    }

    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze a long audiobook run.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP-level failures (auth, rate limit, server error) directly
    # instead of as a confusing KeyError further down.
    response.raise_for_status()
    data = response.json()

    # The audio payload arrives hex-encoded.
    audio_binary = binascii.unhexlify(data['data']['audio'])

    with open(output_file, 'wb') as f:
        f.write(audio_binary)

    return data['extra_info']['audio_length']  # Duration in ms
Split long chapters into manageable chunks:
def chunk_text(text, max_length=5000):
    """Split text into chunks of at most ``max_length`` characters.

    Chunks are cut at sentence boundaries where possible: ``.``, ``!`` or
    ``?`` followed by whitespace, or the CJK terminators ``。！？``. A single
    sentence longer than ``max_length`` is cut at the last space that fits,
    or hard-cut at ``max_length`` if it contains no usable space, so no
    returned chunk exceeds the limit.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty chunks with surrounding whitespace stripped.
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    # The capture group keeps each separator, so the result alternates
    # sentence, separator, sentence, separator, ...
    parts = re.split(r'([.!?]+\s+|[。！？]+\s*)', text)

    chunks = []
    current_chunk = ""

    for i in range(0, len(parts), 2):
        separator = parts[i + 1] if i + 1 < len(parts) else ""
        sentence = parts[i] + separator

        # Close the running chunk if this sentence would overflow it.
        if current_chunk and len(current_chunk) + len(sentence) > max_length:
            chunks.append(current_chunk.strip())
            current_chunk = ""

        # A sentence that is too long on its own is reached only with an
        # empty running chunk; break it up so every piece respects the limit.
        while len(sentence) > max_length:
            cut = sentence.rfind(" ", 0, max_length + 1)
            if cut <= 0:
                cut = max_length  # no usable space: hard cut
            piece = sentence[:cut].strip()
            if piece:
                chunks.append(piece)
            sentence = sentence[cut:].lstrip()

        current_chunk += sentence

    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)

    return chunks
Merge all segments into a complete audiobook:
from pydub import AudioSegment
def create_audiobook(segments, output_file, chapter_markers=None,
                     pause_ms=300, tags=None):
    """Concatenate MP3 segments into one audiobook file and export it.

    Args:
        segments: Sequence of MP3 file paths, in playback order.
        output_file: Path of the MP3 file to write.
        chapter_markers: Optional mapping from a segment index to a chapter
            title; the segment at that index is treated as the chapter start.
        pause_ms: Silence inserted between consecutive segments, in
            milliseconds. Values <= 0 disable the pause.
        tags: Optional dict of tags that override or extend the default
            ``title`` / ``artist`` / ``album`` values.

    Returns:
        A list of ``{"chapter": title, "time": offset_ms}`` dicts, one per
        chapter marker encountered, in playback order.
    """
    export_tags = {
        "title": "Audiobook",
        "artist": "SenseAudio TTS",
        "album": "Generated Audiobook",
    }
    if tags:
        export_tags.update(tags)

    audiobook = AudioSegment.empty()
    chapter_times = []
    last_index = len(segments) - 1

    for i, segment_file in enumerate(segments):
        segment = AudioSegment.from_mp3(segment_file)

        if chapter_markers and i in chapter_markers:
            # Read the offset from the assembled audio rather than summing
            # per-segment lengths: each len() is rounded to whole
            # milliseconds, so a running sum can drift over many segments.
            chapter_times.append({
                "chapter": chapter_markers[i],
                "time": len(audiobook),
            })

        audiobook += segment

        # Pause between segments, but not after the final one.
        if pause_ms > 0 and i < last_index:
            audiobook += AudioSegment.silent(duration=pause_ms)

    audiobook.export(
        output_file,
        format="mp3",
        bitrate="128k",
        tags=export_tags,
    )

    return chapter_times
Make dialogue more natural:
<break time=200>
Adjust narrator voice for different genres:
Add ID3 tags for chapter navigation:
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, CHAP, TIT2, CTOCFlags
def add_chapter_markers(mp3_file, chapters):
    """Write one ID3 ``CHAP`` frame per chapter into an MP3 file.

    Args:
        mp3_file: Path of the MP3 file to tag; it is modified in place.
        chapters: List of ``{"chapter": title, "time": start_ms}`` dicts in
            playback order. Each chapter ends where the next one starts; the
            last chapter ends at the end of the audio.
    """
    audio = MP3(mp3_file, ID3=ID3)

    # A file with no ID3 header has tags set to None; create an empty tag
    # container so frames can be added.
    if audio.tags is None:
        audio.add_tags()

    # info.length is a float number of seconds; the end time is given in the
    # same millisecond unit as the chapter start times. (The original called
    # len() on this float, which raises TypeError.)
    total_ms = int(audio.info.length * 1000)

    for i, chapter in enumerate(chapters):
        if i + 1 < len(chapters):
            end_time = chapters[i + 1]["time"]
        else:
            end_time = total_ms

        audio.tags.add(
            CHAP(
                element_id=f"chp{i}",
                start_time=chapter["time"],
                end_time=end_time,
                sub_frames=[TIT2(text=chapter["chapter"])],
            )
        )

    audio.save()
For long books, track progress:
def generate_audiobook_with_progress(sections, output_dir):
    """Generate one MP3 per section, skipping files that already exist.

    Each section at index ``i`` is written to
    ``{output_dir}/segment_{i:04d}.mp3``. Because existing files are skipped,
    an interrupted run can be resumed by calling the function again.

    Args:
        sections: List of section dicts. Sections with a non-empty ``"text"``
            are synthesized using their ``"voice"``; sections without text
            (for example chapter headings) produce no audio file.
        output_dir: Directory for the segment files; created if missing.
    """
    # Imported here because no file-level `import os` is visible.
    import os

    os.makedirs(output_dir, exist_ok=True)

    total = len(sections)
    completed = 0

    for i, section in enumerate(sections):
        # Chapter-heading sections carry a title but no "text"/"voice";
        # indexing them directly would raise KeyError.
        if not section.get("text"):
            completed += 1
            continue

        output_file = f"{output_dir}/segment_{i:04d}.mp3"

        # Skip if already generated
        if os.path.exists(output_file):
            completed += 1
            continue

        generate_audio_segment(
            section["text"],
            section["voice"],
            output_file=output_file,
        )

        completed += 1
        print(f"Progress: {completed}/{total} ({completed*100//total}%)")
Common issues:
Text too long: Chunk into smaller segments (max 10,000 chars per request)
Voice consistency: Save voice mappings to ensure same character uses same voice
Memory issues: Process chapters separately, don't load entire book at once
API rate limits: Add delays between requests (e.g., 1 second)
The skill produces:
User request: "Convert this novel into an audiobook with different voices for each character"
Skill actions:
For detailed API documentation, see the SenseAudio TTS API reference in the references/ directory.