基于vad对音频进行切割

why?

通过 AI 生成的音频需要上传到X音音乐,但待上传的片段时长超过了 60s

So

基于vad停顿切分音频,分段上传

"""
pip install pydub
pip install numpy
pip install librosa
pip install soundfile
"""
import os

import numpy as np
from pydub import AudioSegment
from pydub.silence import split_on_silence


def dynamic_audio_split(file_path, output_dir="output", max_length=50, min_length=5):
    """Split an audio file at pauses into segments of bounded duration.

    The file is first cut wherever ``split_on_silence`` detects a pause.
    The resulting chunks are then regrouped so that every emitted segment
    lasts between ``min_length`` and ``max_length`` seconds:

    * a buffer shorter than ``min_length`` keeps absorbing the next chunk;
    * a buffer longer than ``max_length`` is cut near the limit, at a pause
      when ``find_optimal_split`` reports one and at the hard limit otherwise;
    * a too-short tail is merged into the previous segment instead of being
      discarded (the merged audio is halved if it would exceed the limit).

    The bounds can only be guaranteed when ``max_length >= 2 * min_length``.

    Args:
        file_path: Path of the audio file, passed to ``AudioSegment.from_file``.
        output_dir: Directory handed to ``validate_and_export``.
        max_length: Upper bound of a segment, in seconds.
        min_length: Lower bound of a segment, in seconds.

    Returns:
        The list of exported segments. The list is empty when no non-silent
        chunk is detected, or when the only audio found is shorter than
        ``min_length`` (such audio is not exported).

    Raises:
        ValueError: If the bounds are inconsistent, or if
            ``validate_and_export`` rejects a segment.
    """
    min_ms = int(min_length * 1000)
    max_ms = int(max_length * 1000)
    if max_ms <= 0 or min_ms > max_ms:
        raise ValueError(
            f"Invalid length bounds: min_length={min_length}, max_length={max_length}"
        )

    # Audio loading and initialization
    audio = AudioSegment.from_file(file_path)
    segments = []

    # Parameters for the primary silence-based segmentation
    base_params = {
        'silence_thresh': -40,
        'min_silence': 600,
        'keep_silence': 200,
        'step_size': 50
    }

    # Phase 1: Primary segmentation
    raw_chunks = split_on_silence(
        audio,
        silence_thresh=base_params['silence_thresh'],
        min_silence_len=base_params['min_silence'],
        keep_silence=base_params['keep_silence'],
        seek_step=base_params['step_size']
    )
    if not raw_chunks:
        # Nothing louder than the silence threshold: there is nothing to cut.
        return segments

    # Phase 2: Regroup chunks so each emitted segment respects both bounds
    buffer = raw_chunks[0]
    for chunk in raw_chunks[1:]:
        buffer = _peel_full_segments(buffer, segments, max_length, min_ms, max_ms)
        if len(buffer) < min_ms:
            # Too short to stand alone: keep accumulating.
            buffer += chunk
        else:
            segments.append(buffer)
            buffer = chunk
    buffer = _peel_full_segments(buffer, segments, max_length, min_ms, max_ms)

    # Final processing: never lose the tail when it can be attached somewhere
    if len(buffer) >= min_ms:
        segments.append(buffer)
    elif segments and len(buffer) > 0:
        merged = segments.pop() + buffer
        if len(merged) <= max_ms:
            segments.append(merged)
        else:
            # Halving keeps both parts within bounds when max >= 2 * min.
            middle = len(merged) // 2
            segments.extend([merged[:middle], merged[middle:]])

    # Validation and export
    validate_and_export(segments, output_dir, min_length, max_length)
    return segments


def _peel_full_segments(buffer, segments, max_length, min_ms, max_ms):
    """Move bounded pieces from the front of ``buffer`` into ``segments``.

    Repeats until ``buffer`` is no longer than ``max_ms`` and returns the
    remaining audio. Each cut uses the position from ``find_optimal_split``
    when it lies within ``[min_ms, max_ms]``; otherwise the cut is made
    exactly at ``max_ms``.
    """
    while len(buffer) > max_ms:
        cut = find_optimal_split(buffer, max_length)
        if not cut or cut < min_ms or cut > max_ms:
            cut = max_ms
        segments.append(buffer[:cut])
        buffer = buffer[cut:]
    return buffer


def dynamic_param_tuning(buffer, base_params):
    """Return a copy of ``base_params`` tuned to the duration of ``buffer``.

    ``len(buffer) / 1000`` is taken as the buffer duration. ``base_params``
    itself is never modified.

    * Duration above 30: ``silence_thresh`` is raised by 5 and
      ``min_silence`` lowered by 100, but never below 300.
    * Duration below 10: ``silence_thresh`` is lowered by 3 and
      ``min_silence`` raised by 150.
    * Otherwise the copy is returned unchanged.
    """
    tuned = base_params.copy()
    seconds = len(buffer) / 1000

    if seconds > 30:
        # Long buffer: higher threshold, shorter required pause.
        tuned['silence_thresh'] = tuned['silence_thresh'] + 5
        tuned['min_silence'] = max(300, tuned['min_silence'] - 100)
        return tuned

    if seconds < 10:
        # Short buffer: lower threshold, longer required pause.
        tuned['silence_thresh'] = tuned['silence_thresh'] - 3
        tuned['min_silence'] = tuned['min_silence'] + 150

    return tuned


def find_optimal_split(chunk, max_length):
    """Find a quiet split position at or just below the target length.

    Candidate positions run from ``max_length`` (or the end of ``chunk`` if
    it is shorter) downwards in 100 ms steps, covering at most the 2 seconds
    below the target. The first candidate that ``is_silent_zone`` accepts is
    returned, so the cut closest to the target wins. Positions under
    5000 ms are never proposed.

    Args:
        chunk: Audio to inspect. It must support ``len()`` in milliseconds
            and provide ``channels``, ``set_channels``, ``frame_rate`` and
            ``get_array_of_samples`` (as pydub's ``AudioSegment`` does).
        max_length: Target (and maximum) split position, in seconds.

    Returns:
        The split position in milliseconds as an ``int``, or ``None`` when no
        candidate is reported as silent.
    """
    limit_ms = int(max_length * 1000)
    upper = min(limit_ms, len(chunk))
    if upper < 5000:
        return None
    lower = min(upper, max(limit_ms - 2000, 0))

    # Analyse a mono down-mix so one array element corresponds to one frame;
    # multi-channel sample arrays interleave the channels.
    mono = chunk.set_channels(1) if chunk.channels > 1 else chunk
    waveform = np.array(mono.get_array_of_samples())

    # Walk downwards from the target so the longest admissible cut is
    # preferred; positions above the target could never be accepted.
    for position in range(upper, lower - 1, -100):
        if position < 5000:
            break
        if is_silent_zone(waveform, position, mono.frame_rate):
            return position
    return None


def is_silent_zone(waveform, position, sr, window=500):
    """Report whether the audio around ``position`` is quiet relative to the peak.

    Args:
        waveform: One-dimensional array of audio samples. Assumed to hold one
            sample per frame (mono); interleaved multi-channel data should be
            down-mixed by the caller -- TODO confirm for other callers.
        position: Centre of the inspected window, in milliseconds.
        sr: Sample rate of ``waveform`` in Hz, used to turn milliseconds into
            sample indices.
        window: Half-width of the inspected window, in milliseconds.

    Returns:
        ``True`` when the RMS of the window is below 2% of the largest
        absolute sample of the whole waveform, or when the whole waveform is
        zero. ``False`` otherwise, including when the window lies entirely
        outside the waveform.
    """
    data = np.asarray(waveform)
    samples_per_ms = sr / 1000
    start = max(0, int((position - window) * samples_per_ms))
    end = min(len(data), int((position + window) * samples_per_ms))
    if end <= start:
        return False

    # Derive the peak from min/max as Python floats: np.abs on a fixed-width
    # integer array cannot represent the magnitude of its most negative value.
    peak = max(abs(float(data.max())), abs(float(data.min())))
    if peak == 0:
        return True

    # Square in float64; squaring int16 samples in place wraps around.
    segment = data[start:end].astype(np.float64)
    rms = float(np.sqrt(np.mean(segment ** 2)))
    return rms < 0.02 * peak


def force_split(chunk, max_length):
    """Split ``chunk`` into consecutive pieces of at most ``max_length`` seconds.

    Args:
        chunk: Sliceable audio whose ``len()`` and slice indices are in
            milliseconds (as with pydub's ``AudioSegment``).
        max_length: Maximum piece duration in seconds; may be fractional.

    Returns:
        A list of consecutive slices of ``chunk``. Every piece except
        possibly the last spans exactly ``max_length`` seconds; an empty
        ``chunk`` yields an empty list.

    Raises:
        ValueError: If ``max_length`` does not amount to at least 1 ms.
    """
    # range() needs an integer step, so convert seconds to whole milliseconds.
    max_ms = int(round(max_length * 1000))
    if max_ms <= 0:
        raise ValueError("max_length must be a positive number of seconds")
    return [chunk[start:start + max_ms] for start in range(0, len(chunk), max_ms)]


def validate_and_export(segments, output_dir, min_len, max_len):
    """Check every segment's duration, then export all of them as WAV files.

    All segments are validated before anything is written, so a rejected
    segment never leaves a partial set of files behind. ``output_dir`` is
    created when it does not exist yet.

    Args:
        segments: Sequence of objects whose ``len()`` is a duration in
            milliseconds and which provide ``export(path, format=...)``.
        output_dir: Directory receiving ``segmentes_<n>.wav`` files, with
            ``n`` starting at 1.
        min_len: Minimum allowed duration, in seconds.
        max_len: Maximum allowed duration, in seconds.

    Raises:
        ValueError: If any segment is shorter than ``min_len`` or longer than
            ``max_len``; the message carries the zero-based segment index.
    """
    for idx, seg in enumerate(segments):
        duration = len(seg) / 1000
        if duration < min_len or duration > max_len:
            raise ValueError(f"Segment {idx} invalid: {duration}s")

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for number, seg in enumerate(segments, start=1):
        seg.export(f"{output_dir}/segmentes_{number}.wav", format="wav")


# Script entry: split "1.wav" into segments lasting between 5 and 50 seconds.
dynamic_audio_split(file_path="1.wav", min_length=5, max_length=50)

评论