Source code for motif.contour_extractors.peak_stream

# -*- coding: utf-8 -*-
"""Salamon's method for extracting contours."""
from __future__ import print_function

import librosa
import numpy as np
import os
# import scipy.signal
import subprocess
from subprocess import CalledProcessError

from motif.core import ContourExtractor
from motif.core import Contours
from motif.contour_extractors import utils

# Suffix of the csv file written by sonic-annotator; the salience loader
# builds the expected output path as "<audio file stem>_<SALAMON_FPATH>".
SALAMON_FPATH = "vamp_melodia-salience_melodia-salience_saliencefunction.csv"
# Plugin identifier as bytes, so it can be searched for directly in the raw
# output of `sonic-annotator -l` returned by subprocess.check_output.
VAMP_PLUGIN = b"vamp:melodia-salience:melodia-salience:saliencefunction"

def _check_binary():
    '''Check if the vamp plugin is available and can be called.

    True if callable, False otherwise

    sonic_annotator_exists = True
        subprocess.check_output(['which', 'sonic-annotator'])
    except CalledProcessError:
        sonic_annotator_exists = False

    if sonic_annotator_exists:
        avail_plugins = subprocess.check_output(["sonic-annotator", "-l"])
        if VAMP_PLUGIN in avail_plugins:
            return True
            return False
        return False

# Evaluated once, when this module is imported. The melodia salience path
# raises EnvironmentError if this is False.
BINARY_AVAILABLE = _check_binary()

class PeakStream(ContourExtractor):
    '''Peak streaming based contour extraction as in [1]_

    .. [1] Salamon, Justin and Gómez, Emilia, and Bonada, Jordi.
        "Sinusoid extraction and salience function design for predominant
        melody estimation." 14th International Conference on Digital Audio
        Effects (DAFX11), Paris, France, 2011.

    Parameters
    ----------
    max_freq : float, default=3000.0
        The maximum frequency allowed in a contour in Hz.
    hop_length : int, default=128
        Number of samples between frames.
    win_length : int, default=2048
        The window size in samples.
    n_fft : int, default=8192
        The fft size in samples.
    h_range : list, default=[1, 2, 3, 4, 5]
        The list of harmonics to use in salience function.
    h_weights : list, default=[1, 0.5, 0.25, 0.25, 0.25]
        The list of weights to apply to each harmonic in salience function.
    interpolation_type : str, default='linear'
        Frequency interpolation type. See scipy.interpolate.interp1d for
        details.
    pitch_cont : float, default=80
        Pitch continuity threshold in cents.
    max_gap : float, default=0.01
        Threshold (in seconds) for how many values can be taken from S-.
    amp_thresh : float, default=0.9
        Threshold on how big a peak must be relative to the maximum in its
        frame.
    dev_thresh : float, default=0.9
        The maximum number of standard deviations below the mean a peak can
        be to survive.
    preprocess : bool, default=True
        If true, normalizes the volume and format of the audio before
        processing. Otherwise computes contours from original audio.
    use_salamon_salience : bool, default=False
        If true, uses salamon vamp plugin to compute salience.

    Attributes
    ----------
    max_freq : float
        The maximum frequency allowed in a contour in Hz.
    hop_length : int
        Number of samples between frames.
    win_length : int
        The window size in samples.
    n_fft : int
        The fft size in samples.
    h_range : list
        The list of harmonics to use in salience function.
    h_weights : list
        The list of weights to apply to each harmonic in salience function.
    interpolation_type : str
        Frequency interpolation type. See scipy.interpolate.interp1d for
        details.
    pitch_cont : float
        Pitch continuity threshold in cents.
    max_gap : float
        Threshold (in seconds) for how many values can be taken from S-.
    amp_thresh : float
        Threshold on how big a peak must be relative to the maximum in its
        frame.
    dev_thresh : float
        The maximum number of standard deviations below the mean a peak can
        be to survive.
    preprocess : bool
        If true, normalizes the volume and format of the audio before
        processing. Otherwise computes contours from original audio.
    use_salamon_salience : bool
        If true, uses salamon vamp plugin to compute salience.

    '''

    def __init__(self, max_freq=3000.0, hop_length=128, win_length=2048,
                 n_fft=8192, h_range=None, h_weights=None,
                 interpolation_type='linear', pitch_cont=80, max_gap=0.01,
                 amp_thresh=0.9, dev_thresh=0.9, preprocess=True,
                 use_salamon_salience=False):
        '''Init method.

        ``h_range`` and ``h_weights`` use ``None`` as a sentinel so that
        every instance gets its own default list instead of sharing one
        mutable object across all instances.
        '''
        self.max_freq = max_freq

        # salience function parameters
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_fft = n_fft
        self.h_range = [1, 2, 3, 4, 5] if h_range is None else h_range
        self.h_weights = (
            [1, 0.5, 0.25, 0.25, 0.25] if h_weights is None else h_weights
        )
        self.interpolation_type = interpolation_type

        # peak streaming parameters
        self.pitch_cont = pitch_cont
        self.max_gap = max_gap
        self.amp_thresh = amp_thresh
        self.dev_thresh = dev_thresh

        self.preprocess = preprocess
        self.use_salamon_salience = use_salamon_salience

        ContourExtractor.__init__(self)

    @property
    def n_gap(self):
        """The number of time frames within the maximum gap

        Returns
        -------
        n_gap : float
            Number of time frames within the maximum gap.

        """
        return self.max_gap * self.sample_rate

    @property
    def audio_samplerate(self):
        """Sample rate of preprocessed audio.

        Returns
        -------
        audio_samplerate : float
            Number of samples per second.

        """
        return 44100.0

    @property
    def sample_rate(self):
        """Sample rate of output contours

        Returns
        -------
        sample_rate : float
            Number of samples per second.

        """
        return self.audio_samplerate / self.hop_length

    @property
    def min_contour_len(self):
        """Minimum allowed contour length.

        Returns
        -------
        min_contour_len : float
            Minimum allowed contour length in seconds.

        """
        return 0.1

    @classmethod
    def get_id(cls):
        """Identifier of this extractor.

        Returns
        -------
        id : str
            Identifier of this extractor.

        """
        return "peak_stream"

    def compute_contours(self, audio_filepath):
        """Compute contours as in Justin Salamon's melodia.

        The salience function is computed either with librosa or, when
        ``use_salamon_salience`` is set, by calling a vamp plugin in the
        background, which creates a csv file. The csv file is loaded into
        memory and the file is deleted.

        Parameters
        ----------
        audio_filepath : str
            Path to audio file.

        Returns
        -------
        Instance of Contours object

        Raises
        ------
        IOError
            If ``audio_filepath`` does not exist.

        """
        if not os.path.exists(audio_filepath):
            raise IOError(
                "The audio file {} does not exist".format(audio_filepath)
            )

        if self.preprocess:
            fpath = self._preprocess_audio(
                audio_filepath, normalize_format=True, normalize_volume=True
            )
        else:
            fpath = audio_filepath

        print("Computing salience...")
        if self.use_salamon_salience:
            times, freqs, S = self._compute_salience_salamon(fpath)
        else:
            y, sr = librosa.load(fpath, sr=self.audio_samplerate)
            times, freqs, S = self._compute_salience(y, sr)

        psh = utils.PeakStreamHelper(
            S, times, freqs, self.amp_thresh, self.dev_thresh, self.n_gap,
            self.pitch_cont, peak_thresh=None
        )

        c_numbers, c_times, c_freqs, c_sal = psh.peak_streaming()

        # Sorting and post-processing only happen when at least one contour
        # was found; an empty result is handed to Contours unchanged.
        if len(c_numbers) > 0:
            c_numbers, c_times, c_freqs, c_sal = self._sort_contours(
                np.array(c_numbers), np.array(c_times), np.array(c_freqs),
                np.array(c_sal)
            )
            (c_numbers, c_times, c_freqs, c_sal) = self._postprocess_contours(
                c_numbers, c_times, c_freqs, c_sal
            )

        return Contours(
            c_numbers, c_times, c_freqs, c_sal, self.sample_rate,
            audio_filepath
        )

    def _compute_salience(self, y, sr):
        """Computes salience function from audio signal using librosa's
        salience function.

        Parameters
        ----------
        y : np.array
            Audio signal
        sr : float
            Audio sample rate

        Returns
        -------
        times : np.array
            Array of times in seconds
        freqs : np.array
            Array of frequencies in Hz
        salience : np.array
            Salience matrix of shape (len(freqs), len(times))

        """
        # compute stft
        # NOTE(review): self.win_length is not passed to the STFT here, so
        # librosa chooses the window length itself - confirm this is intended.
        S = librosa.core.stft(y, n_fft=self.n_fft, hop_length=self.hop_length)
        freqs = librosa.core.fft_frequencies(sr=sr, n_fft=self.n_fft)
        times = librosa.core.frames_to_time(
            np.arange(0, S.shape[1]), sr=sr, hop_length=self.hop_length,
            n_fft=self.n_fft
        )

        # discard unneeded frequencies: nothing above the highest harmonic
        # of max_freq can contribute to a bin that is kept
        max_sal_freq = np.max(self.h_range) * self.max_freq
        max_sal_freq_index = np.argmin(np.abs(freqs - max_sal_freq))
        freqs_reduced = freqs[:max_sal_freq_index]

        S_sal = librosa.harmonic.salience(
            np.abs(S[:max_sal_freq_index, :]), freqs_reduced,
            self.h_range, weights=self.h_weights,
            kind=self.interpolation_type, filter_peaks=True, fill_value=0.0
        )

        # keep only the bins below max_freq in the returned salience
        max_freq_index = np.argmin(np.abs(freqs_reduced - self.max_freq))

        return times, freqs_reduced[:max_freq_index], S_sal[:max_freq_index, :]

    def _compute_salience_salamon(self, fpath):
        """Computes salience function from audio signal using melodia's
        salience function.

        Parameters
        ----------
        fpath : str
            Path to audio file.

        Returns
        -------
        times : np.array
            Array of times in seconds
        freqs : np.array
            Array of frequencies in Hz
        salience : np.array
            Salience matrix of shape (len(freqs), len(times))

        Raises
        ------
        EnvironmentError
            If sonic-annotator or the vamp plugin is not available.
        IOError
            If sonic-annotator did not produce the expected csv file.

        """
        if not BINARY_AVAILABLE:
            raise EnvironmentError(
                "Either the vamp plugin {} needed to compute these contours or "
                "sonic-annotator is not available.".format(VAMP_PLUGIN)
            )

        f_dir = os.path.dirname(fpath)
        f_name = os.path.basename(fpath)
        fpath_out = os.path.join(
            f_dir, "{}_{}".format(f_name.split('.')[0], SALAMON_FPATH)
        )

        # remove stale output so the existence check below is meaningful
        if os.path.exists(fpath_out):
            os.remove(fpath_out)

        binary_call = [
            "sonic-annotator", "-d",
            "vamp:melodia-salience:melodia-salience:saliencefunction",
            fpath, "-w", "csv", "--csv-force"
        ]
        # Pass the arguments as a list (no shell) so that paths containing
        # spaces or shell metacharacters are handed over verbatim.
        try:
            subprocess.call(binary_call)
        except OSError:
            # The binary could not be launched; the missing output file is
            # reported as IOError just below.
            pass

        if not os.path.exists(fpath_out):
            raise IOError("output file does not exist")

        try:
            S_sal = np.loadtxt(fpath_out, dtype=float, delimiter=',')
        finally:
            # the csv is only a temporary artefact; delete it even when
            # parsing fails
            os.remove(fpath_out)

        # scale each column of the loaded matrix by its maximum, then
        # transpose
        S_sal = (S_sal / np.max(S_sal, axis=0)).T

        # NOTE(review): 44100 / 128 are hard-coded rather than taken from
        # self.audio_samplerate / self.hop_length - presumably the plugin's
        # fixed analysis settings; confirm.
        times = librosa.core.frames_to_time(
            np.arange(0, S_sal.shape[1]), sr=44100, hop_length=128
        )
        # 601 bins starting at 55 Hz with 120 bins per octave (10 cents each)
        freqs = 55.0 * np.power(2.0, (np.arange(0, 601)) / 120.0)

        return times, freqs, S_sal