use rand;
use rand::distributions::IndependentSample;
use nalgebra::{Norm};
use vptree::{VPTree, MetricItem};
use std::cmp;
use std::collections::BTreeMap;
use num::Zero;
use model::{Vector};

use super::tsne_util::{Vec2f, cauchy_pdf_d2, find_p_for_perplexity, dist_sq};
use super::tsne_options::{TSNEOptions};
use super::quadtree::{QuadTree};

/// t-SNE reducer that maps input vectors to 2-D points (see `reduction`).
///
/// The gradient is evaluated with the help of a `QuadTree` and the input
/// similarities are kept sparse, which (together with the name) suggests
/// the Barnes-Hut variant of t-SNE.
pub struct BHTSNE {
    /// Options controlling the reduction (perplexity, learning rate,
    /// momentum, early exaggeration, iteration count, initial spread).
    options: TSNEOptions,
    /// One `sigma` per input vector, as reported by the perplexity search.
    /// Cleared and refilled each time the input probabilities are computed.
    pub sigmas: Vec<f32>
}

/// A borrowed input vector tagged with its position in the input slice, so
/// that items returned by the `VPTree` can be mapped back to their index.
struct IndexedVec<'a> {
    /// Position of `vec` in the input slice.
    index: usize,
    /// The borrowed input vector.
    vec: &'a Vector<f32>
}

/// Euclidean length of `v`: the square root of the sum of its squared
/// components.
fn norm(v: &Vector<f32>) -> f32 {
    let sum_of_squares = v.iter().map(|component| *component * *component).sum::<f32>();
    sum_of_squares.sqrt()
}

impl<'a> MetricItem<f32> for IndexedVec<'a> {
    /// Distance between two items: the Euclidean norm of the difference of
    /// the wrapped vectors. The `index` fields play no part in it.
    fn distance(&self, other: &Self) -> f32 {
        let difference = self.vec - other.vec;
        norm(&difference)
    }
}

/// Sparse `f32` matrix: one ordered map per row, keyed by column index.
/// Entries that are absent from a row's map are simply not stored.
type SparseBTMatrix = Vec<BTreeMap<usize, f32>>;

impl BHTSNE {
    /// Create a reducer with the given options, or with
    /// `TSNEOptions::default()` when `opt` is `None`.
    ///
    /// `sigmas` starts out empty; it is filled in by `reduction`.
    pub fn new(opt: Option<TSNEOptions>) -> Self {
        BHTSNE {
            // Build the default lazily so it is only constructed when the
            // caller did not supply options.
            options: opt.unwrap_or_else(TSNEOptions::default),
            sigmas: Vec::new()
        }
    }

    /// Return the cost of the current mapping and the gradient.
    ///
    /// * `p_sym` - sparse joint probabilities; row `i` maps a column `j` to
    ///   `p_ij`.
    /// * `p_mult` - multiplier applied to the attractive term of the
    ///   gradient and folded into the cost.
    /// * `y` - current output positions, indexed like the rows of `p_sym`.
    /// * `y_qt` - quadtree queried for each point's `frep` quantities.
    ///
    /// Returns `(cost, gradient)` with one gradient vector per row of
    /// `p_sym`.
    fn cost_gradient(p_sym: &SparseBTMatrix, p_mult: f32,
                     y: &[Vec2f], y_qt: &QuadTree) -> (f32, Vec<Vec2f>) {
        // Per-point pair from the quadtree: `.0` is the vector that enters
        // the gradient below, `.1` is summed into the normalization constant.
        // NOTE(review): 0.5 is presumably the Barnes-Hut accuracy threshold;
        // confirm against `QuadTree::compute_frep_quantities`.
        let freps: Vec<_> = y.iter().map(|x| y_qt.compute_frep_quantities(x, 0.5)).collect();

        // Full normalization constant.
        let z: f32 = freps.iter().map(|x| x.1).sum();

        let cost_nomult = p_sym.iter().enumerate().map(|(i, row)| {
            let yi = y[i];
            row.iter().map(|(j, p_ij)| {
                let p_ij = *p_ij;
                if p_ij == 0.0 {
                    // Usual convention 0 * ln(0) = 0. Evaluated literally
                    // this is 0 * -inf = NaN, which would poison the whole
                    // cost and every later comparison against it.
                    0.0
                } else {
                    let d = yi - y[*j];
                    let q_ij = cauchy_pdf_d2(d.norm_squared()) / z;
                    p_ij * (p_ij / q_ij).ln()
                }
            }).sum::<f32>()
        }).sum::<f32>();

        // Fold the multiplier into the cost.
        // NOTE(review): this equals the divergence of `p_mult * P` from Q
        // only if the entries of `p_sym` sum to one; confirm.
        let cost = p_mult * (cost_nomult + p_mult.ln());

        // Gradient
        let grad: Vec<_> = p_sym.iter().enumerate().map(|(i, row)| {
            let p = y[i];

            // compute f_attr from the stored (neighbour) entries only
            let f_attr = row.iter().map(|(j, v)| {
                let d = p - y[*j];
                let q_z = cauchy_pdf_d2(d.norm_squared());
                (v * q_z) * d
            }).fold(Vec2f::zero(), |acc, x| acc + x);

            // compute dC/di
            let frep: Vec2f = freps[i].0 / z;
            4.0 * (f_attr * p_mult + frep)
        }).collect();

        (cost, grad)
    }

    /// Draw `n` random starting positions for the output vectors.
    ///
    /// Both coordinates of every point are sampled independently from a
    /// normal distribution with mean zero and standard deviation `s`
    /// (x first, then y).
    fn initial_result(&self, n: usize, s: f32) -> Vec<Vec2f> {
        let mut rng = rand::thread_rng();
        let gaussian = rand::distributions::Normal::new(0.0, s as f64);

        let mut points = Vec::with_capacity(n);
        for _ in 0..n {
            let px = gaussian.ind_sample(&mut rng) as f32;
            let py = gaussian.ind_sample(&mut rng) as f32;
            points.push(Vec2f::new(px, py));
        }
        points
    }

    /// Compute the sparse, symmetrized joint probabilities of the inputs.
    ///
    /// For every input vector its nearest neighbours are looked up in a
    /// `VPTree`. The neighbour count is `floor(3 * perplexity)`, capped at
    /// the number of other inputs. Their squared distances are turned into
    /// conditional probabilities by `find_p_for_perplexity`, and the result
    /// is symmetrized so that entry `(i, j)` holds
    /// `(p_cond[i][j] + p_cond[j][i]) / (2 * n)`.
    ///
    /// Side effect: `self.sigmas` is cleared and refilled with one sigma per
    /// input vector.
    ///
    /// # Panics
    ///
    /// Panics if the tree cannot be built or if the perplexity search fails
    /// for any point (both results are unwrapped).
    fn sparse_input_jp_matrix(&mut self, inputs: &[Vector<f32>]) -> SparseBTMatrix {
        let items: Vec<IndexedVec> = inputs.iter().enumerate().map(|(i, v)| IndexedVec{ index: i, vec: v }).collect();
        let ni = inputs.len();
        self.sigmas.clear();

        let tree = VPTree::new(items).unwrap();

        // `saturating_sub` keeps the cap at zero instead of underflowing
        // when there are no inputs at all.
        let nnn = cmp::min((self.options.perplexity * 3.0).floor() as usize, ni.saturating_sub(1));

        let mut p_cond: SparseBTMatrix = Vec::with_capacity(ni);

        for i in 0..ni {
            // Find the nearest neighbours of each vector; one extra is
            // requested because the query point itself is among the hits.
            let target = &inputs[i];
            let mut nn = tree.nearest_neighbors(&IndexedVec{ index: i, vec: target }, nnn+1, true);

            // Remove the query point from its own neighbour list. It is
            // normally the first hit, but exact duplicates tie with it at
            // distance zero and may be returned ahead of it, so look it up
            // by index rather than assuming position 0.
            let self_pos = nn.iter().position(|x| x.index == i);
            match self_pos {
                Some(pos) => { nn.remove(pos); },
                // The query point was crowded out entirely, so every hit is
                // a distance-zero duplicate; drop one to keep `nnn` of them.
                None => nn.truncate(nnn),
            }

            // compute the conditional probabilities
            let r2: Vec<f32> = nn.iter().map(|ref x| dist_sq(&target, &x.vec)).collect();
            let fit = find_p_for_perplexity(&r2, self.options.perplexity).unwrap();
            self.sigmas.push(fit.sigma);

            let mut row = BTreeMap::new();
            for (neighbor, prob) in nn.iter().zip(&fit.probs) {
                row.insert(neighbor.index, *prob);
            }
            p_cond.push(row);
        }

        // Symmetrize the p_cond 'matrix' to get a p_sym 'matrix'.
        let normalization: f32 = 0.5 / (ni as f32);
        let mut p_sym: SparseBTMatrix = (0..ni).map(|_| BTreeMap::new()).collect();

        for (i, row) in p_cond.into_iter().enumerate() {
            for (j, val) in &row {
                // Each conditional probability contributes to both (i, j)
                // and (j, i).
                {
                    let v_ij = p_sym[i].entry(*j).or_insert(0.0);
                    *v_ij += val * normalization;
                }
                {
                    let v_ji = p_sym[*j].entry(i).or_insert(0.0);
                    *v_ji += val * normalization;
                }
            }
        }

        p_sym
    }

    /// Perform the visualization reduction from a set of inputs to
    /// output vectors.
    ///
    /// Starting from random positions, the mapping is improved by gradient
    /// steps for at most `options.num_iterations` iterations. A step is kept
    /// only if it lowers the cost: the learning rate then grows by 5%,
    /// otherwise it is halved and the step is retried. The optimisation
    /// stops early once 100 consecutive retries fail within one iteration.
    ///
    /// Returns one 2-D point per input vector. As a side effect
    /// `self.sigmas` is refreshed (see `sparse_input_jp_matrix`).
    ///
    /// # Panics
    ///
    /// Panics if a `QuadTree` cannot be built over the current positions, or
    /// if computing the input probabilities panics.
    pub fn reduction(&mut self, inputs: &[Vector<f32>]) -> Vec<Vec2f> {
        let n = inputs.len();
        let mut eta = self.options.learning_rate;

        // Compute the implied joint probabilities of the inputs, based on
        // the perplexity.
        let p = self.sparse_input_jp_matrix(inputs);

        let early_p_mult = match self.options.early_exaggeration {
            Some(ref eeo) => eeo.mult,
            None => 1.0
        };

        let mut result = self.initial_result(n, self.options.initial_spread);

        let qt = QuadTree::new(&result).unwrap();

        // `cost` and `grad` always describe `result` under the multiplier
        // recorded in `baseline_p_mult`.
        let mut baseline_p_mult = early_p_mult;
        let r = Self::cost_gradient(&p, baseline_p_mult, &result, &qt);
        let mut cost = r.0;
        let mut grad = r.1;

        let mut prev_result = result.clone();
        let mut new_result = result.clone();

        const MAX_BAD_ITERATIONS: usize = 100;

        for iter in 0..self.options.num_iterations {
            let use_p_mult = match self.options.early_exaggeration {
                Some(ref eeo) =>
                    if iter < eeo.iterations { early_p_mult } else { 1.0 },
                None => 1.0
            };

            // When the exaggeration multiplier changes, the stored cost and
            // gradient belong to a different objective. Re-evaluate them at
            // the current positions so the acceptance test below compares
            // like with like and the step follows the right gradient.
            // (Exact float comparison is intended: both values are copies of
            // either `early_p_mult` or the literal 1.0.)
            if use_p_mult != baseline_p_mult {
                let qt = QuadTree::new(&result).unwrap();
                let r = Self::cost_gradient(&p, use_p_mult, &result, &qt);
                cost = r.0;
                grad = r.1;
                baseline_p_mult = use_p_mult;
            }

            // The momentum rate depends only on the iteration number.
            let m = self.options.momentum.rate(iter);

            let mut bad_iterations = 0;
            while bad_iterations < MAX_BAD_ITERATIONS {
                // Evaluate the new result at the given learning rate.
                // NOTE(review): the momentum term is scaled by the gradient
                // (`g * m * (curr - prev)`), whereas textbook momentum adds
                // `m * (curr - prev)`. Left unchanged; confirm the intent.
                for i in 0..n {
                    let curr = result[i];
                    let prev = prev_result[i];
                    let g = grad[i];
                    new_result[i] = curr - g * eta - g * m * (curr - prev);
                }

                // Compute the cost and gradient at the new location.
                let new_qt = QuadTree::new(&new_result).unwrap();
                let (new_cost, new_grad) = Self::cost_gradient(&p, use_p_mult, &new_result, &new_qt);

                if new_cost < cost - (n as f32) * 1e-10 {
                    // If the descent was effective, update the
                    // learning rate and use the new result.
                    cost = new_cost;
                    grad = new_grad;

                    eta *= 1.05;
                    break
                } else {
                    // Otherwise, decrease the learning rate and retry.
                    eta *= 0.5;
                    bad_iterations += 1;
                }
            }
            if bad_iterations >= MAX_BAD_ITERATIONS {
                // No learning rate produced an improvement; keep the last
                // accepted positions.
                break;
            }

            // Rotate new_result -> result -> prev_result -> new_result, with no allocations.
            let temp = prev_result;
            prev_result = result;
            result = new_result;
            new_result = temp;
        }

        result
    }
}