# Source code for motif.contour_extractors.peak_stream

# -*- coding: utf-8 -*-
"""Salamon's method for extracting contours
"""
from __future__ import print_function

import librosa
import numpy as np
import os
# import scipy.signal
import subprocess
from subprocess import CalledProcessError

from motif.core import ContourExtractor
from motif.core import Contours
from motif.contour_extractors import utils


# Suffix of the CSV file that is looked for after running sonic-annotator:
# the salience output path is built as "<audio base name>_<SALAMON_FPATH>"
# in the same directory as the audio file.
SALAMON_FPATH = "vamp_melodia-salience_melodia-salience_saliencefunction.csv"
# Identifier of the melodia salience vamp plugin. Kept as bytes because it is
# searched for in the raw output of `sonic-annotator -l`.
VAMP_PLUGIN = b"vamp:melodia-salience:melodia-salience:saliencefunction"


def _check_binary():
    '''Check if the vamp plugin is available and can be called.

    Two conditions must hold: ``sonic-annotator`` must be locatable with
    ``which``, and ``VAMP_PLUGIN`` must appear in the plugin list printed by
    ``sonic-annotator -l``.

    Any failure to run either command counts as "not available" rather than
    raising. This covers a non-zero exit status as well as a missing
    executable (for instance when ``which`` itself does not exist on the
    platform). The function is called at import time, so an exception here
    would make the whole module unimportable.

    Returns
    -------
    True if callable, False otherwise

    '''
    try:
        subprocess.check_output(['which', 'sonic-annotator'])
        avail_plugins = subprocess.check_output(["sonic-annotator", "-l"])
    except (CalledProcessError, OSError):
        # CalledProcessError: the command ran but reported failure.
        # OSError: the command could not be started at all.
        return False

    return VAMP_PLUGIN in avail_plugins


# Evaluated once, when the module is imported. Checked before the vamp-based
# salience computation is attempted.
BINARY_AVAILABLE = _check_binary()


class PeakStream(ContourExtractor):
    '''Peak streaming based contour extraction as in [1]_

    .. [1] Salamon, Justin and Gómez, Emilia, and Bonada, Jordi.
        "Sinusoid extraction and salience function design for predominant
        melody estimation." 14th International Conference on Digital Audio
        Effects (DAFX11), Paris, France, 2011.

    Parameters
    ----------
    max_freq : float, default=3000.0
        The maximum frequency allowed in a contour in Hz.
    hop_length : int, default=128
        Number of samples between frames.
    win_length : int, default=2048
        The window size in samples.
    n_fft : int, default=8192
        The fft size in samples.
    h_range : list, default=[1, 2, 3, 4, 5]
        The list of harmonics to use in salience function.
    h_weights : list, default=[1, 0.5, 0.25, 0.25, 0.25]
        The list of weights to apply to each harmonic in salience function.
    interpolation_type : str, default='linear'
        Frequency interpolation type. See scipy.interpolate.interp1d for
        details.
    pitch_cont : float, default=80
        Pitch continuity threshold in cents.
    max_gap : float, default=0.01
        Threshold (in seconds) for how many values can be taken from S-.
    amp_thresh : float, default=0.9
        Threshold on how big a peak must be relative to the maximum in its
        frame.
    dev_thresh : float, default=0.9
        The maximum number of standard deviations below the mean a peak can
        be to survive.
    preprocess : bool, default=True
        If true, normalizes the volume and format of the audio before
        processing. Otherwise computes contours from original audio.
    use_salamon_salience : bool, default=False
        If true, uses salamon vamp plugin to compute salience.

    Attributes
    ----------
    max_freq : float
        The maximum frequency allowed in a contour in Hz.
    hop_length : int
        Number of samples between frames.
    win_length : int
        The window size in samples.
    n_fft : int
        The fft size in samples.
    h_range : list
        The list of harmonics to use in salience function.
    h_weights : list
        The list of weights to apply to each harmonic in salience function.
    interpolation_type : str
        Frequency interpolation type. See scipy.interpolate.interp1d for
        details.
    pitch_cont : float
        Pitch continuity threshold in cents.
    max_gap : float
        Threshold (in seconds) for how many values can be taken from S-.
    amp_thresh : float
        Threshold on how big a peak must be relative to the maximum in its
        frame.
    dev_thresh : float
        The maximum number of standard deviations below the mean a peak can
        be to survive.
    preprocess : bool
        If true, normalizes the volume and format of the audio before
        processing. Otherwise computes contours from original audio.
    use_salamon_salience : bool
        If true, uses salamon vamp plugin to compute salience.

    '''
    def __init__(self, max_freq=3000.0, hop_length=128, win_length=2048,
                 n_fft=8192, h_range=[1, 2, 3, 4, 5],
                 h_weights=[1, 0.5, 0.25, 0.25, 0.25],
                 interpolation_type='linear', pitch_cont=80, max_gap=0.01,
                 amp_thresh=0.9, dev_thresh=0.9, preprocess=True,
                 use_salamon_salience=False):
        '''Init method.
        '''

        self.max_freq = max_freq

        # salience function parameters
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_fft = n_fft
        # The default lists in the signature are created once and shared by
        # every call. Store private copies so that mutating one instance's
        # settings can never leak into another instance or into the defaults.
        self.h_range = None if h_range is None else list(h_range)
        self.h_weights = None if h_weights is None else list(h_weights)
        self.interpolation_type = interpolation_type

        # peak streaming parameters
        self.pitch_cont = pitch_cont
        self.max_gap = max_gap
        self.amp_thresh = amp_thresh
        self.dev_thresh = dev_thresh

        self.preprocess = preprocess
        self.use_salamon_salience = use_salamon_salience

        ContourExtractor.__init__(self)

    @property
    def n_gap(self):
        """The number of time frames within the maximum gap

        Returns
        -------
        n_gap : float
            Number of time frames within the maximum gap.

        """
        # seconds * (frames per second) -> frames
        return self.max_gap * self.sample_rate

    @property
    def audio_samplerate(self):
        """Sample rate of preprocessed audio.

        Returns
        -------
        audio_samplerate : float
            Number of samples per second.

        """
        return 44100.0

    @property
    def sample_rate(self):
        """Sample rate of output contours

        Returns
        -------
        sample_rate : float
            Number of samples per second.

        """
        # one contour sample per analysis frame
        return self.audio_samplerate / self.hop_length

    @property
    def min_contour_len(self):
        """Minimum allowed contour length.

        Returns
        -------
        min_contour_len : float
            Minimum allowed contour length in seconds.

        """
        return 0.1

    @classmethod
    def get_id(cls):
        """Identifier of this extractor.

        Returns
        -------
        id : str
            Identifier of this extractor.

        """
        return "peak_stream"

    def compute_contours(self, audio_filepath):
        """Compute contours as in Justin Salamon's melodia.

        A salience function is computed first, either with librosa or, when
        ``use_salamon_salience`` is set, by calling a vamp plugin in the
        background (which creates a csv file that is loaded into memory and
        then deleted). Contours are then tracked on the salience function by
        peak streaming.

        Parameters
        ----------
        audio_filepath : str
            Path to audio file.

        Returns
        -------
        Instance of Contours object

        Raises
        ------
        IOError
            If ``audio_filepath`` does not exist.

        """
        if not os.path.exists(audio_filepath):
            raise IOError(
                "The audio file {} does not exist".format(audio_filepath)
            )

        if self.preprocess:
            fpath = self._preprocess_audio(
                audio_filepath, normalize_format=True,
                normalize_volume=True
            )
        else:
            fpath = audio_filepath

        print("Computing salience...")
        if self.use_salamon_salience:
            times, freqs, S = self._compute_salience_salamon(fpath)
        else:
            y, sr = librosa.load(fpath, sr=self.audio_samplerate)
            times, freqs, S = self._compute_salience(y, sr)

        psh = utils.PeakStreamHelper(
            S, times, freqs, self.amp_thresh, self.dev_thresh, self.n_gap,
            self.pitch_cont, peak_thresh=None
        )

        c_numbers, c_times, c_freqs, c_sal = psh.peak_streaming()
        # Sorting and post-processing are skipped when nothing was tracked;
        # the empty results are handed to Contours as they are.
        if len(c_numbers) > 0:
            c_numbers, c_times, c_freqs, c_sal = self._sort_contours(
                np.array(c_numbers), np.array(c_times), np.array(c_freqs),
                np.array(c_sal)
            )
            (c_numbers, c_times, c_freqs, c_sal) = self._postprocess_contours(
                c_numbers, c_times, c_freqs, c_sal
            )

        return Contours(
            c_numbers, c_times, c_freqs, c_sal, self.sample_rate,
            audio_filepath
        )

    def _compute_salience(self, y, sr):
        """Computes salience function from audio signal using librosa's
        salience function.

        Parameters
        ----------
        y : np.array
            Audio signal
        sr : float
            Audio sample rate

        Returns
        -------
        times : np.array
            Array of times in seconds
        freqs : np.array
            Array of frequencies in Hz
        salience : np.array
            Salience matrix of shape (len(freqs), len(times))

        """
        # compute stft
        # NOTE(review): self.win_length is stored on the instance but is not
        # forwarded here - confirm whether it should be passed to stft.
        S = librosa.core.stft(y, n_fft=self.n_fft, hop_length=self.hop_length)
        freqs = librosa.core.fft_frequencies(sr=sr, n_fft=self.n_fft)
        # sr is passed by keyword: recent librosa releases no longer accept
        # it positionally.
        times = librosa.core.frames_to_time(
            np.arange(0, S.shape[1]), sr=sr, hop_length=self.hop_length,
            n_fft=self.n_fft
        )

        # discard unneeded frequencies: nothing above the highest harmonic of
        # the highest allowed contour frequency is needed for the salience
        max_sal_freq = np.max(self.h_range) * self.max_freq
        max_sal_freq_index = np.argmin(np.abs(freqs - max_sal_freq))
        freqs_reduced = freqs[:max_sal_freq_index]

        S_sal = librosa.harmonic.salience(
            np.abs(S[:max_sal_freq_index, :]), freqs_reduced,
            self.h_range, weights=self.h_weights, kind=self.interpolation_type,
            filter_peaks=True, fill_value=0.0
        )

        # keep only the rows below the maximum contour frequency
        max_freq_index = np.argmin(np.abs(freqs_reduced - self.max_freq))
        return times, freqs_reduced[:max_freq_index], S_sal[:max_freq_index, :]

    def _compute_salience_salamon(self, fpath):
        """Computes salience function from audio signal using melodia's
        salience function.

        Runs ``sonic-annotator`` on ``fpath``, loads the csv file it writes
        next to the audio file and deletes that csv file again.

        Parameters
        ----------
        fpath : str
            Path to audio file.

        Returns
        -------
        times : np.array
            Array of times in seconds
        freqs : np.array
            Array of frequencies in Hz
        salience : np.array
            Salience matrix of shape (len(freqs), len(times))

        Raises
        ------
        EnvironmentError
            If sonic-annotator or the vamp plugin was not found at import
            time.
        IOError
            If sonic-annotator cannot be started or its output file is
            missing.

        """
        # VAMP_PLUGIN is bytes; decode once so that it can be used both in
        # messages and as a command line argument.
        plugin_id = VAMP_PLUGIN.decode("ascii")

        if not BINARY_AVAILABLE:
            raise EnvironmentError(
                "Either the vamp plugin {} needed to compute these contours or "
                "sonic-annotator is not available.".format(plugin_id)
            )

        f_dir = os.path.dirname(fpath)
        f_name = os.path.basename(fpath)
        # NOTE(review): the base name is cut at the FIRST dot - confirm this
        # matches how sonic-annotator names its output for files such as
        # "a.b.wav".
        fpath_out = os.path.join(
            f_dir,
            "{}_{}".format(f_name.split('.')[0], SALAMON_FPATH)
        )
        # remove stale output so that the existence check below is meaningful
        if os.path.exists(fpath_out):
            os.remove(fpath_out)

        binary_call = [
            "sonic-annotator", "-d", plugin_id,
            fpath, "-w", "csv", "--csv-force"
        ]
        # Pass the argument list directly instead of a joined shell string so
        # that paths containing spaces or shell metacharacters are handled
        # correctly. The exit status is not inspected; success is judged by
        # the presence of the output file.
        try:
            subprocess.call(binary_call)
        except OSError as exc:
            raise IOError("could not run sonic-annotator: {}".format(exc))

        if not os.path.exists(fpath_out):
            raise IOError("output file does not exist")

        try:
            S_sal = np.loadtxt(fpath_out, dtype=float, delimiter=',')
        finally:
            # always clean up the temporary csv, even if parsing failed
            os.remove(fpath_out)

        # Scale every csv column by its maximum, then transpose.
        # NOTE(review): a column whose maximum is 0 yields a division by
        # zero - confirm whether that can occur in practice.
        S_sal = (S_sal / np.max(S_sal, axis=0)).T
        times = librosa.core.frames_to_time(
            np.arange(0, S_sal.shape[1]), sr=44100, hop_length=128
        )
        # 601 frequencies starting at 55 Hz, 120 steps per octave
        freqs = 55.0 * np.power(2.0, (np.arange(0, 601)) / 120.0)
        return times, freqs, S_sal