rustframe/compute/models/
pca.rs

1//! Principal Component Analysis using covariance matrices.
2//!
3//! ```
4//! use rustframe::compute::models::pca::PCA;
5//! use rustframe::matrix::Matrix;
6//!
7//! let data = Matrix::from_rows_vec(vec![1.0, 1.0, 2.0, 2.0], 2, 2);
8//! let pca = PCA::fit(&data, 1, 0);
9//! let projected = pca.transform(&data);
10//! assert_eq!(projected.cols(), 1);
11//! ```
12use crate::compute::stats::correlation::covariance_matrix;
13use crate::compute::stats::descriptive::mean_vertical;
14use crate::matrix::{Axis, Matrix, SeriesOps};
15
16/// Returns the `n_components` principal axes (rows) and the centred data's mean.
17pub struct PCA {
18    pub components: Matrix<f64>, // (n_components, n_features)
19    pub mean: Matrix<f64>,       // (1, n_features)
20}
21
22impl PCA {
23    pub fn fit(x: &Matrix<f64>, n_components: usize, _iters: usize) -> Self {
24        let mean = mean_vertical(x); // Mean of each feature (column)
25        let broadcasted_mean = mean.broadcast_row_to_target_shape(x.rows(), x.cols());
26        let centered_data = x.zip(&broadcasted_mean, |x_i, mean_i| x_i - mean_i);
27        let covariance_matrix = covariance_matrix(&centered_data, Axis::Col); // Covariance between features
28
29        let mut components = Matrix::zeros(n_components, x.cols());
30        for i in 0..n_components {
31            if i < covariance_matrix.rows() {
32                components.row_copy_from_slice(i, &covariance_matrix.row(i));
33            } else {
34                break;
35            }
36        }
37
38        PCA { components, mean }
39    }
40
41    /// Project new data on the learned axes.
42    pub fn transform(&self, x: &Matrix<f64>) -> Matrix<f64> {
43        let broadcasted_mean = self.mean.broadcast_row_to_target_shape(x.rows(), x.cols());
44        let centered_data = x.zip(&broadcasted_mean, |x_i, mean_i| x_i - mean_i);
45        centered_data.matrix_mul(&self.components.transpose())
46    }
47}
48
49#[cfg(test)]
50mod tests {
51    use super::*;
52    use crate::matrix::Matrix;
53
54    const EPSILON: f64 = 1e-8;
55
56    #[test]
57    fn test_pca_basic() {
58        // Simple 2D data with points along the y = x line
59        let data = Matrix::from_rows_vec(vec![1.0, 1.0, 2.0, 2.0, 3.0, 3.0], 3, 2);
60        let (_n_samples, _n_features) = data.shape();
61
62        let pca = PCA::fit(&data, 1, 0); // n_components = 1, iters is unused
63
64        println!("Data shape: {:?}", data.shape());
65        println!("PCA mean shape: {:?}", pca.mean.shape());
66        println!("PCA components shape: {:?}", pca.components.shape());
67
68        // Expected mean: (2.0, 2.0)
69        assert!((pca.mean.get(0, 0) - 2.0).abs() < EPSILON);
70        assert!((pca.mean.get(0, 1) - 2.0).abs() < EPSILON);
71
72        // For data along y=x, the principal component should be proportional to (1/sqrt(2), 1/sqrt(2)) or (1,1)
73        // The covariance matrix will be:
74        // [[1.0, 1.0],
75        //  [1.0, 1.0]]
76        // The principal component (eigenvector) will be (0.707, 0.707) or (-0.707, -0.707)
77        // Since we are taking the row from the covariance matrix directly, it will be (1.0, 1.0)
78        assert!((pca.components.get(0, 0) - 1.0).abs() < EPSILON);
79        assert!((pca.components.get(0, 1) - 1.0).abs() < EPSILON);
80
81        // Test transform: centered data projects to [-2.0, 0.0, 2.0]
82        let transformed_data = pca.transform(&data);
83        assert_eq!(transformed_data.rows(), 3);
84        assert_eq!(transformed_data.cols(), 1);
85        assert!((transformed_data.get(0, 0) - -2.0).abs() < EPSILON);
86        assert!((transformed_data.get(1, 0) - 0.0).abs() < EPSILON);
87        assert!((transformed_data.get(2, 0) - 2.0).abs() < EPSILON);
88    }
89
90    #[test]
91    fn test_pca_fit_break_branch() {
92        // Data with 2 features
93        let data = Matrix::from_rows_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 3, 2);
94        let (_n_samples, n_features) = data.shape();
95
96        // Set n_components greater than n_features to trigger the break branch
97        let n_components_large = n_features + 1;
98        let pca = PCA::fit(&data, n_components_large, 0);
99
100        // The components matrix should be initialized with n_components_large rows,
101        // but only the first n_features rows should be copied from the covariance matrix.
102        // The remaining rows should be zeros.
103        assert_eq!(pca.components.rows(), n_components_large);
104        assert_eq!(pca.components.cols(), n_features);
105
106        // Verify that rows beyond n_features are all zeros
107        for i in n_features..n_components_large {
108            for j in 0..n_features {
109                assert!((pca.components.get(i, j) - 0.0).abs() < EPSILON);
110            }
111        }
112    }
113}