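//! t-SNE (t-distributed Stochastic Neighbor Embedding): reduce a set of
//! high-dimensional input vectors to 2-D output points for visualization.
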
use rand;
use rand::distributions::IndependentSample;
use nalgebra::{Norm};
use util::remove_diagonal;

use super::tsne_util::{Vec2f, cauchy_pdf_d2, find_p_for_perplexity, dist_sq};
use super::tsne_options::{TSNEOptions};
use model::{Vector, Matrix};

pub struct TSNE {
    options: TSNEOptions,
}

/// Compute the symmetrized joint probability matrix for the given inputs
/// at the target perplexity.
pub fn input_joint_probability_matrix(inputs: &[Vector<f32>], perplexity: f32) -> Matrix<f32> {
    let n = inputs.len();

    // Compute the squared distance matrix
    let mut d2 = Matrix::zeros((n,n));
    for i in 0..n {
        let xi = &inputs[i];
        for j in 0..i {
            let d = dist_sq(xi, &inputs[j]);
            // Duplicate input points are not supported.
            assert!(d > 0.0);
            d2[[i, j]] = d;
            d2[[j, i]] = d;
        }
    }

    // Strip the diagonal so each row holds only distances to the other points.
    let m = remove_diagonal(d2);
    let n = m.rows();
    let mut p: Vec<f32> = Vec::with_capacity(n * n);

    // Compute the conditional probabilities for each row at the target
    // perplexity, reinserting a zero at the diagonal position.
    for (i, r) in m.outer_iter().enumerate() {
        let s: &[f32] = r.as_slice().unwrap();
        let v = find_p_for_perplexity(s, perplexity).unwrap();
        p.extend_from_slice(&v.probs[0..i]);
        p.push(0.0);
        p.extend_from_slice(&v.probs[i..]);
    }

    // Symmetrization factor: p_ij = (p_{j|i} + p_{i|j}) / (2n).
    let f: f32 = 1.0 / (2.0 * (n as f32));
    let pm = Matrix::from_shape_vec((n, n), p).ok().unwrap();

    // Return the symmetrized version
    let mut p_sym = Matrix::zeros((n, n));
    for i in 0..n {
        for j in 0..i {
            let x = (pm[[i, j]] + pm[[j, i]]) * f;

            p_sym[[i, j]] = x;
            p_sym[[j, i]] = x;
        }
    }

    p_sym
}


/// Given the output vectors, compute the low-dimensional similarity matrix `q`,
/// where `q_ij` is proportional to `1 / (1 + ||y_i - y_j||^2)` and the whole
/// matrix is normalized to sum to one.
pub fn output_q_matrix(v: &[Vec2f]) -> Matrix<f32> {
    let n = v.len();
    let mut c = Matrix::zeros((n, n));
    for i in 0..n {
        for j in 0..i {
            let d2 = (v[i] - v[j]).norm_squared();
            let w = cauchy_pdf_d2(d2);
            c[[i, j]] = w;
            c[[j, i]] = w;
        }
    }
    let z: f32 = c.iter().sum();
    c / z
}

/// Return the cost of the current mapping, via the KL-divergence
/// `C = sum_{i != j} p_ij * ln(p_ij / q_ij)`.
pub fn cost_function(p_sym: &Matrix<f32>, q: &Matrix<f32>) -> f32 {
    let n = p_sym.rows();
    assert_eq!(n, q.rows());
    (0..n).map(|i| {
        (0..n).map(|j| {
            if i != j && p_sym[[i, j]] > 0.0 {
                p_sym[[i, j]] * (p_sym[[i, j]] / q[[i, j]]).ln()
            }
            else {
                0.0
            }
        }).sum::<f32>()
    }).sum()
}

impl TSNE {
    pub fn new(opt: Option<TSNEOptions>) -> Self {
        TSNE {
            options: opt.unwrap_or(TSNEOptions::default())
        }
    }

    /// Return the derivative of the cost function with respect to the
    /// i'th output vector,
    /// `dC/dy_i = 4 * sum_j (p_ij - q_ij) * (y_i - y_j) / (1 + ||y_i - y_j||^2)`
    /// (up to the constant factor folded into `cauchy_pdf_d2`).
    fn dcost(p_sym: &Matrix<f32>, q: &Matrix<f32>, y: &[Vec2f], i: usize) -> Vec2f {
        let x = &y[i];
        y.iter().enumerate().map(|(j, e)| {
            let to_vec = *x - *e;
            let d = cauchy_pdf_d2( to_vec.norm_squared() );
            let pdiff = p_sym[[i, j]] - q[[i, j]];
            to_vec * (pdiff * d)
        }).fold(Vec2f::new(0.0, 0.0), |a, b| { a + b }) * 4.0
    }

    /// Return the full gradient of the cost function
    fn full_gradient(p_sym: &Matrix<f32>, q: &Matrix<f32>, y: &[Vec2f]) -> Vec<Vec2f> {
        let n = p_sym.rows();
        (0..n).map(|i| Self::dcost(p_sym, q, y, i)).collect()
    }

    /// Generate a set of output vectors randomly.
    ///
    /// Uses a bivariate symmetric gaussian distribution, centered at
    /// the origin with a standard deviation of `s`.
    fn initial_result(&self, n: usize, s: f32) -> Vec<Vec2f> {
        let mut rng = rand::thread_rng();
        let normal = rand::distributions::Normal::new(0.0, s as f64);
        (0..n).map(|_| {
            let x = normal.ind_sample(&mut rng) as f32;
            let y = normal.ind_sample(&mut rng) as f32;
            Vec2f::new(x, y)
        }).collect()
    }


    /// Perform the visualization reduction from a set of inputs to
    /// output vectors.
    pub fn reduction(&mut self, inputs: &[Vector<f32>]) -> Vec<Vec2f> {
        let n = inputs.len();
        let mut eta = self.options.learning_rate;

        // Use the squared distance matrix to compute the implied joint
        // probabilities, based on the perplexity
        let p = input_joint_probability_matrix(inputs, self.options.perplexity);

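        // Early exaggeration: inflate `p` by the configured multiplier so that,
        // for the first few iterations, clusters form tightly and separate early.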
        let early_p = match self.options.early_exaggeration {
            Some(ref eeo) => &p * eeo.mult,
            None => p.clone()
        };

        let mut result = self.initial_result(n, self.options.initial_spread);

        let mut q = output_q_matrix(&result);

        let mut cost = cost_function(&early_p, &q);

        let mut prev_result = result.clone();
        let mut new_result = result.clone();

        for iter in 0..self.options.num_iterations {
            let use_p = match self.options.early_exaggeration {
                Some(ref eeo) =>
                    if iter < eeo.iterations { &early_p } else { &p },
                None => &p
            };

            // The cached cost was measured against the previous iteration's
            // `use_p`; when the early-exaggeration phase ends that matrix
            // changes, so refresh the cost before the line search below.
            if let Some(ref eeo) = self.options.early_exaggeration {
                if iter == eeo.iterations {
                    cost = cost_function(use_p, &q);
                }
            }

            // Compute the gradient at the current result
            let grad = Self::full_gradient(&use_p, &q, &result);

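            // Backtracking line search on the learning rate: keep halving `eta`
            // until the proposed step lowers the cost, then grow it by 5% for
            // the next iteration.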
            loop {
                // Evaluate the new result at the given learning rate.
                let m = self.options.momentum.rate(iter);
                for i in 0..n {
                    let curr = result[i];
                    let prev = prev_result[i];
                    let g = grad[i];
                    new_result[i] = curr - g * eta - g * m * (curr - prev);
                }

                let new_q = output_q_matrix(&new_result);
                let new_cost = cost_function(&use_p, &new_q);

                if new_cost < cost - (n as f32) * 1e-10 {
                    // If the descent was effective, update the
                    // learning rate and use the new result.
                    q = new_q;
                    cost = new_cost;

                    eta *= 1.05;
                    break
                } else {
                    // Otherwise halve the learning rate and try again.
                    eta *= 0.5;
                }
            }
            prev_result = result;
            result = new_result.clone();
        }

        result
    }
}
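
// A minimal sanity-check sketch, not part of the original module: it assumes
// only the `Vec2f::new` constructor and the `Matrix` indexing already used
// above, and exercises the basic invariants of the helpers (symmetry,
// normalization, and zero KL-divergence for identical distributions). It is
// illustrative, not a definitive specification of the implementation.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn q_matrix_is_symmetric_and_normalized() {
        let y = vec![
            Vec2f::new(0.0, 0.0),
            Vec2f::new(1.0, 0.0),
            Vec2f::new(0.0, 2.0),
        ];
        let q = output_q_matrix(&y);

        // The off-diagonal weights are normalized to sum to one ...
        let total: f32 = q.iter().sum();
        assert!((total - 1.0).abs() < 1e-5);

        // ... and the matrix is symmetric with a zero diagonal.
        for i in 0..3 {
            assert_eq!(q[[i, i]], 0.0);
            for j in 0..3 {
                assert!((q[[i, j]] - q[[j, i]]).abs() < 1e-6);
            }
        }
    }

    #[test]
    fn kl_divergence_of_identical_distributions_is_zero() {
        let y = vec![
            Vec2f::new(0.0, 0.0),
            Vec2f::new(1.0, 1.0),
            Vec2f::new(-1.0, 0.5),
        ];
        let q = output_q_matrix(&y);
        // KL(P || Q) vanishes when P == Q.
        assert!(cost_function(&q, &q).abs() < 1e-6);
    }
}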