| 1 |
#' @useDynLib libqe, .registration = TRUE |
|
| 2 |
#' @importFrom Rcpp evalCpp |
|
| 3 |
NULL |
|
| 4 | ||
| 5 |
.onLoad <- function(libname, pkgname) {
|
|
| 6 |
# Nothing to initialise — libqe is a header + Rcpp library. |
|
| 7 | ! |
invisible(NULL) |
| 8 |
} |
| 1 |
/** @file accumulation.hpp |
|
| 2 |
* @brief Accumulation primitives shared between the rENA stanza-window model |
|
| 3 |
* and the tma ground/response/tensor model. |
|
| 4 |
* |
|
| 5 |
* Both models reduce to the same core connection_matrix() operation; they |
|
| 6 |
* differ in how ground and response vectors are assembled from the raw data. |
|
| 7 |
*/ |
|
| 8 |
#ifndef LIBQE_ACCUMULATION_HPP |
|
| 9 |
#define LIBQE_ACCUMULATION_HPP |
|
| 10 | ||
| 11 |
#include <armadillo> |
|
| 12 |
#include <functional> |
|
| 13 |
#include <vector> |
|
| 14 |
#include "adjacency.hpp" |
|
| 15 | ||
| 16 |
namespace qe {
|
|
| 17 | ||
| 18 |
/// @name Core adjacency math |
|
| 19 |
/// @{
|
|
| 20 | ||
| 21 |
/** @brief Compute the connection matrix for one ground + response event pair. |
|
| 22 |
* |
|
| 23 |
* @param[in] ground Row vector of accumulated ground codes (length p). |
|
| 24 |
* @param[in] response Row vector of the focal (response) codes (length p). |
|
| 25 |
* @param[in] response_weight Scalar weight applied to the response self-connection |
|
| 26 |
* term. Defaults to 1.0. |
|
| 27 |
* @param[in] ordered When @c true, produces a directed (asymmetric) matrix: |
|
| 28 |
* @c ground→response cross-product plus |
|
| 29 |
* @c 0.5 * response_weight * response self-connection |
|
| 30 |
* (diagonal zeroed). |
|
| 31 |
* When @c false, produces a symmetric undirected matrix: |
|
| 32 |
* @c (response_weight * r⊗r) + g⊗r + r⊗g. |
|
| 33 |
* |
|
| 34 |
* @returns A p × p connection matrix. |
|
| 35 |
* |
|
| 36 |
* @note Equivalent to @c calculate_adjacency_matrix() in @c tma/code.cpp. |
|
| 37 |
*/ |
|
| 38 | 24x |
inline arma::mat connection_matrix( |
| 39 |
arma::rowvec ground, arma::rowvec response, |
|
| 40 |
double response_weight = 1.0, bool ordered = true |
|
| 41 |
) {
|
|
| 42 | 24x |
arma::mat resp_self = response.t() * response; |
| 43 | 24x |
arma::mat g_by_r = ground.t() * response; |
| 44 | ||
| 45 | 24x |
if (ordered) {
|
| 46 | 19x |
resp_self.diag().zeros(); |
| 47 | 76x |
return g_by_r + (0.5 * response_weight * resp_self); |
| 48 |
} |
|
| 49 | 10x |
return (response_weight * resp_self) + g_by_r + (response.t() * ground); |
| 50 |
} |
|
| 51 | ||
| 52 |
/// @} |
|
| 53 | ||
| 54 |
/// @name Traditional stanza-window accumulation (rENA model) |
|
| 55 |
/// @{
|
|
| 56 | ||
| 57 |
/** @brief Accumulate connection vectors for every row in one conversation using |
|
| 58 |
* a stanza window. |
|
| 59 |
* |
|
| 60 |
* For each focal row @c k in @p codes the function assembles a ground context |
|
| 61 |
* from the surrounding window and calls connection_matrix(). |
|
| 62 |
* |
|
| 63 |
* @par Ordering semantics |
|
| 64 |
* - @b Undirected (@p ordered = @c false, default — rENA stanza model): the |
|
| 65 |
* window spans [@c k - window_back, @c k + window_forward]; the upper-triangle |
|
| 66 |
* outer-product (code_connections) is computed and back/forward-reference |
|
| 67 |
* corrections are subtracted. Returns a matrix of shape |
|
| 68 |
* @c n_rows × choose_two(n_codes). |
|
| 69 |
* - @b Directed (@p ordered = @c true): focal row @c k is the response; the |
|
| 70 |
* sum of prior rows [@c earliest, @c k−1] is the ground. |
|
| 71 |
* connection_matrix() is called with @p ordered = @c true and the result is |
|
| 72 |
* vectorised. @p window_forward is ignored (future rows cannot be causal |
|
| 73 |
* ground context). Returns a matrix of shape @c n_rows × n_codes². |
|
| 74 |
* |
|
| 75 |
* @param[in] codes Code matrix for one conversation (n_rows × n_codes). |
|
| 76 |
* @param[in] window_back Number of prior rows included in the window |
|
| 77 |
* (1 = current row only; INT_MAX = all prior rows). |
|
| 78 |
* @param[in] window_forward Number of future rows included in the window |
|
| 79 |
* (ignored when @p ordered is @c true). |
|
| 80 |
* @param[in] binary When @c true, clamp all positive entries to 1. |
|
| 81 |
* @param[in] ordered When @c true, use directed accumulation; when |
|
| 82 |
* @c false, use undirected stanza-window accumulation. |
|
| 83 |
* |
|
| 84 |
* @returns Connection matrix (n_rows × connection_vector_length) where |
|
| 85 |
* connection_vector_length is choose_two(n_codes) for undirected or |
|
| 86 |
* n_codes² for directed. |
|
| 87 |
* |
|
| 88 |
* @note Equivalent to @c ref_window_df() in @c rENA/ena.cpp (undirected case). |
|
| 89 |
*/ |
|
| 90 | 10x |
inline arma::mat accumulate_stanza( |
| 91 |
arma::mat codes, |
|
| 92 |
int window_back = 1, |
|
| 93 |
int window_forward = 0, |
|
| 94 |
bool binary = true, |
|
| 95 |
bool ordered = false |
|
| 96 |
) {
|
|
| 97 | 10x |
int n_rows = codes.n_rows; |
| 98 | 10x |
int n_codes = codes.n_cols; |
| 99 | 10x |
const int INT_MAX_VAL = std::numeric_limits<int>::max(); |
| 100 | ||
| 101 |
// Shared helper: earliest row index for focal row k |
|
| 102 | 33x |
auto get_earliest = [&](int row) -> int {
|
| 103 | 33x |
if (window_back == INT_MAX_VAL || window_back == 0) |
| 104 | ! |
return (window_back == 0) ? row : 0; |
| 105 | 33x |
return std::max(0, row - (window_back - 1)); |
| 106 |
}; |
|
| 107 | ||
| 108 | 10x |
if (ordered) {
|
| 109 | 5x |
arma::mat out(n_rows, n_codes * n_codes, arma::fill::zeros); |
| 110 | 20x |
for (int row = 0; row < n_rows; row++) {
|
| 111 | 15x |
int earliest = get_earliest(row); |
| 112 | 30x |
arma::rowvec response = codes.row(row); |
| 113 | 15x |
arma::rowvec ground(n_codes, arma::fill::zeros); |
| 114 | 15x |
if (row > earliest) |
| 115 | 20x |
ground = arma::sum(codes.rows(earliest, row - 1)); |
| 116 | 60x |
out.row(row) = arma::vectorise( |
| 117 | 45x |
connection_matrix(ground, response, 1.0, true)).t(); |
| 118 |
} |
|
| 119 | 8x |
if (binary) out.elem(arma::find(out > 0)).ones(); |
| 120 | 5x |
return out; |
| 121 |
} |
|
| 122 | ||
| 123 |
// Undirected: existing rENA stanza-window logic |
|
| 124 | 5x |
int n_tri = choose_two(n_codes); |
| 125 | 5x |
arma::mat out(n_rows, n_tri, arma::fill::zeros); |
| 126 | ||
| 127 | 23x |
for (int row = 0; row < n_rows; row++) {
|
| 128 | 18x |
int earliest = get_earliest(row); |
| 129 | ||
| 130 | 18x |
int latest = row; |
| 131 | 18x |
if (window_forward == INT_MAX_VAL || row + window_forward >= n_rows) {
|
| 132 | ! |
latest = n_rows - 1; |
| 133 | 18x |
} else if (window_forward > 0) {
|
| 134 | ! |
latest = std::min(n_rows - 1, row + window_forward); |
| 135 |
} |
|
| 136 | ||
| 137 | 36x |
arma::mat window_rows = codes.rows(earliest, latest); |
| 138 | 18x |
arma::mat summed = arma::sum(window_rows); |
| 139 | 18x |
arma::rowvec to_ut = code_connections(summed); |
| 140 | ||
| 141 |
// Back-reference correction |
|
| 142 | 18x |
int win_rows = latest - earliest + 1; |
| 143 | 18x |
if (win_rows > 0 && window_back > 1 && row - 1 >= 0) {
|
| 144 | 9x |
int head_rows = win_rows - 1 - window_forward; |
| 145 | 9x |
if (head_rows > 0) {
|
| 146 | 9x |
arma::mat refs = window_rows.head_rows(head_rows); |
| 147 | 9x |
arma::mat ref_sum = arma::sum(refs); |
| 148 | 9x |
to_ut -= code_connections(ref_sum); |
| 149 |
} |
|
| 150 |
} |
|
| 151 | ||
| 152 |
// Forward-reference correction |
|
| 153 |
if (window_forward > 0 && latest <= n_rows - 1) {
|
|
| 154 | ! |
int tail_rows = latest - row; |
| 155 | ! |
if (tail_rows > 0) {
|
| 156 | ! |
arma::mat refs = window_rows.tail_rows(tail_rows); |
| 157 | ! |
arma::mat ref_sum = arma::sum(refs); |
| 158 | ! |
to_ut -= code_connections(ref_sum); |
| 159 |
} |
|
| 160 |
} |
|
| 161 | ||
| 162 | 36x |
out.row(row) = to_ut; |
| 163 |
} |
|
| 164 | ||
| 165 | 8x |
if (binary) out.elem(arma::find(out > 0)).ones(); |
| 166 | 5x |
return out; |
| 167 |
} |
|
| 168 | ||
| 169 |
/// @} |
|
| 170 | ||
| 171 |
/// @name Ground/response accumulation (tma model) |
|
| 172 |
/// @{
|
|
| 173 | ||
| 174 |
/** @brief Accumulate connections for a single unit from its context rows. |
|
| 175 |
* |
|
| 176 |
* For every response row belonging to this unit, the function computes a |
|
| 177 |
* weighted ground vector from all preceding context rows and calls |
|
| 178 |
* connection_matrix(). |
|
| 179 |
* |
|
| 180 |
* @par Ordering semantics |
|
| 181 |
* When @p ordered is @c false (default), the result is folded into the |
|
| 182 |
* upper-triangle representation (length choose_two(p)). When @p ordered is |
|
| 183 |
* @c true, the full directed p² vector is returned. |
|
| 184 |
* |
|
| 185 |
* @param[in] codes Full context matrix visible to this unit (n × p). |
|
| 186 |
* @param[in] unit_rows 0-based indices of the rows that belong to this unit. |
|
| 187 |
* @param[in] decay_fn Callback with signature |
|
| 188 |
* @c arma::vec(arma::vec distances) that maps a vector |
|
| 189 |
* of distances (0 = current row, 1 = one step back, …) |
|
| 190 |
* to a vector of scalar weights; default behaviour is a |
|
| 191 |
* simple rectangular window. |
|
| 192 |
* @param[in] ordered When @c true, return a directed flat vector (length p²); |
|
| 193 |
* when @c false, return an undirected upper-triangle flat |
|
| 194 |
* vector (length choose_two(p)). |
|
| 195 |
* |
|
| 196 |
* @returns Flat connection vector of length choose_two(p) (undirected) or p² |
|
| 197 |
* (directed). |
|
| 198 |
* |
|
| 199 |
* @note Equivalent to @c accumulate_network() in @c tma/code.cpp, with the R |
|
| 200 |
* function callback replaced by @c std::function. |
|
| 201 |
*/ |
|
| 202 | 3x |
inline arma::rowvec accumulate_unit( |
| 203 |
const arma::mat& codes, |
|
| 204 |
const std::vector<int>& unit_rows, |
|
| 205 |
std::function<arma::vec(arma::vec)> decay_fn, |
|
| 206 |
bool ordered = false |
|
| 207 |
) {
|
|
| 208 | 3x |
int code_cnt = codes.n_cols; |
| 209 | 3x |
arma::mat g_w_vec(code_cnt, code_cnt, arma::fill::zeros); |
| 210 | ||
| 211 | 11x |
for (int unit_row : unit_rows) {
|
| 212 | 8x |
if (unit_row == 0) continue; |
| 213 | ||
| 214 | 14x |
arma::rowvec response = codes.row(unit_row); |
| 215 | ||
| 216 |
// Ground rows: everything from 0 up to (and including) unit_row |
|
| 217 | 7x |
arma::uvec ground_indices = arma::regspace<arma::uvec>(0, 1, unit_row); |
| 218 | 7x |
arma::vec distances(ground_indices.n_elem); |
| 219 | 32x |
for (arma::uword k = 0; k < ground_indices.n_elem; k++) |
| 220 | 75x |
distances[k] = static_cast<double>(unit_row - ground_indices[k]); |
| 221 | ||
| 222 | 7x |
arma::vec decay_weights = decay_fn(distances); |
| 223 | ||
| 224 | 7x |
arma::mat ground_codes = codes.rows(ground_indices); |
| 225 | 7x |
arma::mat weighted = ground_codes.each_col() % decay_weights; |
| 226 | 7x |
arma::rowvec g_summed = arma::sum(weighted); |
| 227 | ||
| 228 |
// Exclude the response row's own contribution from the ground |
|
| 229 | 21x |
arma::rowvec g_no_resp = g_summed - weighted.tail_rows(1).row(0); |
| 230 | ||
| 231 | 7x |
g_w_vec += connection_matrix(g_no_resp, response, 1.0, ordered); |
| 232 |
} |
|
| 233 | ||
| 234 | 3x |
if (!ordered) return fold_directed_network(arma::vectorise(g_w_vec)); |
| 235 | 2x |
return arma::vectorise(g_w_vec).t(); |
| 236 |
} |
|
| 237 | ||
| 238 |
/// @} |
|
| 239 | ||
| 240 |
/// @name Extended ground/response accumulation — returns per-row connection data |
|
| 241 |
/// @{
|
|
| 242 | ||
| 243 |
/** @brief Result of accumulate_unit_with_rows(). |
|
| 244 |
* |
|
| 245 |
* Bundles the unit-level flat connection vector together with the per-response-row |
|
| 246 |
* full p² connection matrices needed by tma's @c accumulate_network(). |
|
| 247 |
*/ |
|
| 248 |
struct UnitNetworks {
|
|
| 249 |
arma::rowvec networks; ///< Flat connection vector (choose_two(p) or p²). |
|
| 250 |
arma::mat row_networks; ///< Per-response-row full p² matrix (n_unit_rows × p²). |
|
| 251 |
}; |
|
| 252 | ||
| 253 |
/** @brief Like accumulate_unit() but also returns the per-response-row connection |
|
| 254 |
* matrix needed by tma's @c accumulate_network(). |
|
| 255 |
* |
|
| 256 |
* @param[in] codes Full context matrix visible to this unit (n × p). |
|
| 257 |
* @param[in] unit_rows 0-based indices of the rows that belong to this unit. |
|
| 258 |
* @param[in] decay_fn Callback with signature |
|
| 259 |
* @c arma::vec(int unit_row, arma::uvec ground_indices) |
|
| 260 |
* that returns a weight vector of length |
|
| 261 |
* @c ground_indices.n_elem. The two-argument form lets |
|
| 262 |
* callers (e.g. the tma Rcpp wrapper) set R environment |
|
| 263 |
* variables before invoking the actual R decay function, |
|
| 264 |
* without any R-specific code leaking into libqe. |
|
| 265 |
* @param[in] ordered When @c true, return directed flat vectors (length p²); |
|
| 266 |
* when @c false, return undirected upper-triangle flat |
|
| 267 |
* vectors (length choose_two(p)). |
|
| 268 |
* |
|
| 269 |
* @returns A UnitNetworks struct containing the aggregated connection vector |
|
| 270 |
* and the per-response-row connection matrices. |
|
| 271 |
* |
|
| 272 |
* @note Equivalent to @c accumulate_network() in @c tma/code.cpp (extended form). |
|
| 273 |
*/ |
|
| 274 | ! |
inline UnitNetworks accumulate_unit_with_rows( |
| 275 |
const arma::mat& codes, |
|
| 276 |
const std::vector<int>& unit_rows, |
|
| 277 |
std::function<arma::vec(int, arma::uvec)> decay_fn, |
|
| 278 |
bool ordered = false |
|
| 279 |
) {
|
|
| 280 | ! |
int code_cnt = codes.n_cols; |
| 281 | ! |
int n_unit_rows = unit_rows.size(); |
| 282 | ||
| 283 | ! |
arma::mat g_w_mat(code_cnt, code_cnt, arma::fill::zeros); |
| 284 | ! |
arma::mat row_networks(n_unit_rows, code_cnt * code_cnt, arma::fill::zeros); |
| 285 | ||
| 286 | ! |
for (int i = 0; i < n_unit_rows; ++i) {
|
| 287 | ! |
int unit_row = unit_rows[i]; |
| 288 | ||
| 289 | ! |
arma::uvec ground_indices = arma::regspace<arma::uvec>(0, 1, unit_row); |
| 290 | ! |
arma::vec decay_weights = decay_fn(unit_row, ground_indices); |
| 291 | ||
| 292 | ! |
arma::mat ground_codes = codes.rows(ground_indices); |
| 293 | ! |
arma::mat weighted = ground_codes.each_col() % decay_weights; |
| 294 | ! |
arma::rowvec g_summed = arma::sum(weighted); |
| 295 | ! |
arma::rowvec g_no_resp = g_summed - weighted.tail_rows(1).row(0); |
| 296 | ||
| 297 | ! |
arma::rowvec response = codes.row(unit_row); |
| 298 | ! |
arma::mat conn = connection_matrix(g_no_resp, response, 1.0, ordered); |
| 299 | ! |
g_w_mat += conn; |
| 300 | ! |
row_networks.row(i) = arma::vectorise(conn).t(); |
| 301 |
} |
|
| 302 | ||
| 303 | ! |
UnitNetworks result; |
| 304 | ! |
if (!ordered) |
| 305 | ! |
result.networks = fold_directed_network(arma::vectorise(g_w_mat)); |
| 306 |
else |
|
| 307 | ! |
result.networks = arma::vectorise(g_w_mat).t(); |
| 308 | ! |
result.row_networks = row_networks; |
| 309 | ! |
return result; |
| 310 |
} |
|
| 311 | ||
| 312 |
/// @} |
|
| 313 | ||
| 314 |
/// @name Tensor-based multi-modal accumulation (tma model) |
|
| 315 |
/// @{
|
|
| 316 | ||
| 317 |
/** @brief Compute the linear index into a column-major multi-dimensional array. |
|
| 318 |
* |
|
| 319 |
* @param[in] indices Per-dimension index values (0-based). |
|
| 320 |
* @param[in] dims Size of each dimension. |
|
| 321 |
* |
|
| 322 |
* @returns The scalar column-major flat index. |
|
| 323 |
* |
|
| 324 |
* @note Equivalent to @c flat_index() in @c tma/code.cpp. |
|
| 325 |
*/ |
|
| 326 | ! |
inline int flat_index(const std::vector<int>& indices, |
| 327 |
const std::vector<int>& dims) {
|
|
| 328 | ! |
if (indices.size() != dims.size()) |
| 329 | ! |
throw std::invalid_argument("Number of indices must match number of dimensions.");
|
| 330 | ! |
size_t linear = 0, stride = 1; |
| 331 | ! |
for (size_t v = 0; v < indices.size(); ++v) {
|
| 332 | ! |
linear += static_cast<size_t>(indices[v]) * stride; |
| 333 | ! |
stride *= static_cast<size_t>(dims[v]); |
| 334 |
} |
|
| 335 | ! |
return static_cast<int>(linear); |
| 336 |
} |
|
| 337 | ||
| 338 |
/** @brief Result of apply_tensor_unit(). |
|
| 339 |
* |
|
| 340 |
* Bundles the unit-level aggregated connection vector together with the |
|
| 341 |
* per-response-row connection matrices used by the tma tensor accumulation. |
|
| 342 |
*/ |
|
| 343 |
struct TensorNetworks {
|
|
| 344 |
arma::rowvec connection_counts; ///< Unit-level flat vector (p²). |
|
| 345 |
arma::mat row_connection_counts; ///< Per-response-row connection matrix (n_unit_rows × p²). |
|
| 346 |
}; |
|
| 347 | ||
| 348 |
/** @brief Pure-C++ port of the tma @c apply_tensor() inner logic. |
|
| 349 |
* |
|
| 350 |
* For each response row in @p unit_rows, the function looks up window and |
|
| 351 |
* weight values from the tensor, collects all ground rows that fall within the |
|
| 352 |
* response row's window, and calls connection_matrix() to accumulate the |
|
| 353 |
* connection counts. |
|
| 354 |
* |
|
| 355 |
* @param[in] tensor Flat column-major array containing window and |
|
| 356 |
* weight values indexed by the factor dimensions. |
|
| 357 |
* @param[in] dims Sizes of each dimension of @p tensor. |
|
| 358 |
* @param[in] dims_sender Tensor axis indices corresponding to sender factors. |
|
| 359 |
* @param[in] dims_receiver Tensor axis indices corresponding to receiver |
|
| 360 |
* factors; these axes are overridden to the response |
|
| 361 |
* row's values when looking up window sizes for |
|
| 362 |
* ground rows. |
|
| 363 |
* @param[in] dims_mode Tensor axis indices corresponding to mode factors. |
|
| 364 |
* @param[in] context_lookup Integer matrix (n_context_rows × n_factors, 0-based) |
|
| 365 |
* mapping each context row to its factor level indices. |
|
| 366 |
* @param[in] unit_rows 0-based response-row indices for this unit. |
|
| 367 |
* @param[in] codes Full context code matrix (n_context_rows × n_codes). |
|
| 368 |
* @param[in] times Timestamp associated with each context row |
|
| 369 |
* (length n_context_rows). |
|
| 370 |
* @param[in] ordered When @c true, produce a directed full p² result; |
|
| 371 |
* when @c false, produce an undirected upper-triangle |
|
| 372 |
* result. |
|
| 373 |
* |
|
| 374 |
* @returns A TensorNetworks struct containing the unit-level connection counts |
|
| 375 |
* and the per-response-row connection counts. |
|
| 376 |
* |
|
| 377 |
* @note Equivalent to the inner loop of @c apply_tensor() in @c tma/code.cpp. |
|
| 378 |
*/ |
|
| 379 | ! |
inline TensorNetworks apply_tensor_unit( |
| 380 |
const arma::vec& tensor, |
|
| 381 |
const std::vector<int>& dims, |
|
| 382 |
const std::vector<int>& dims_sender, |
|
| 383 |
const std::vector<int>& dims_receiver, |
|
| 384 |
const std::vector<int>& dims_mode, |
|
| 385 |
const arma::imat& context_lookup, |
|
| 386 |
const std::vector<int>& unit_rows, |
|
| 387 |
const arma::mat& codes, |
|
| 388 |
const arma::vec& times, |
|
| 389 |
bool ordered = true |
|
| 390 |
) {
|
|
| 391 | ! |
const int WINDOW_DIM = 1; |
| 392 | ! |
const int WEIGHT_DIM = 0; |
| 393 | ! |
const bool IS_DEFAULT = (dims.size() == 1 && dims[0] == 2); |
| 394 | ||
| 395 | ! |
int code_cnt = codes.n_cols; |
| 396 | ! |
int n_unit_rows = static_cast<int>(unit_rows.size()); |
| 397 | ! |
int ctx_cols = static_cast<int>(context_lookup.n_cols); |
| 398 | ||
| 399 | ! |
arma::mat g_w_mat(code_cnt, code_cnt, arma::fill::zeros); |
| 400 | ! |
arma::mat row_conn(n_unit_rows, code_cnt * code_cnt, arma::fill::zeros); |
| 401 | ||
| 402 | ! |
int response_win = 0; |
| 403 | ! |
int response_weight = 0; |
| 404 | ! |
if (IS_DEFAULT) {
|
| 405 | ! |
response_win = static_cast<int>(tensor[1]); |
| 406 | ! |
response_weight = static_cast<int>(tensor[0]); |
| 407 |
} |
|
| 408 | ||
| 409 | ! |
for (int i = 0; i < n_unit_rows; ++i) {
|
| 410 | ! |
int ri = unit_rows[i]; |
| 411 | ! |
double response_time = times[ri]; |
| 412 | ||
| 413 | ! |
std::vector<int> resp_ctx(ctx_cols + 1); |
| 414 | ! |
for (int j = 0; j < ctx_cols; ++j) resp_ctx[j] = context_lookup(ri, j); |
| 415 | ! |
resp_ctx[ctx_cols] = WINDOW_DIM; |
| 416 | ||
| 417 | ! |
if (!IS_DEFAULT) |
| 418 | ! |
response_win = static_cast<int>(tensor[flat_index(resp_ctx, dims)]); |
| 419 | ||
| 420 | ! |
std::vector<int> gri_v; |
| 421 | ! |
std::vector<double> grw_v; |
| 422 | ! |
arma::rowvec g_ws(code_cnt, arma::fill::zeros); |
| 423 | ||
| 424 | ! |
if (ri > 0) {
|
| 425 | ! |
for (int gr = ri - 1; gr >= 0; --gr) {
|
| 426 | ! |
std::vector<int> row_v(ctx_cols + 1); |
| 427 | ! |
for (int j = 0; j < ctx_cols; ++j) row_v[j] = context_lookup(gr, j); |
| 428 | ! |
row_v[ctx_cols] = WINDOW_DIM; |
| 429 | ! |
for (int dim : dims_receiver) row_v[dim] = resp_ctx[dim]; |
| 430 | ||
| 431 | ! |
double row_win = static_cast<double>(response_win); |
| 432 | ! |
if (!IS_DEFAULT) |
| 433 | ! |
row_win = tensor[flat_index(row_v, dims)]; |
| 434 | ||
| 435 | ! |
if (times[gr] + row_win > response_time) {
|
| 436 | ! |
gri_v.push_back(gr); |
| 437 | ! |
row_v[ctx_cols] = WEIGHT_DIM; |
| 438 | ! |
double row_wgt = static_cast<double>(response_weight); |
| 439 | ! |
if (!IS_DEFAULT) |
| 440 | ! |
row_wgt = tensor[flat_index(row_v, dims)]; |
| 441 | ! |
grw_v.push_back(row_wgt); |
| 442 |
} |
|
| 443 |
} |
|
| 444 | ||
| 445 | ! |
if (!gri_v.empty()) {
|
| 446 | ! |
arma::uvec gri_u(gri_v.size()); |
| 447 | ! |
for (size_t k = 0; k < gri_v.size(); ++k) gri_u[k] = gri_v[k]; |
| 448 | ! |
arma::mat gc = codes.rows(gri_u); |
| 449 | ! |
arma::colvec wts = arma::conv_to<arma::colvec>::from(grw_v); |
| 450 | ! |
g_ws = arma::sum(gc.each_col() % wts, 0); |
| 451 |
} |
|
| 452 |
} |
|
| 453 | ||
| 454 | ! |
if (!IS_DEFAULT) {
|
| 455 | ! |
resp_ctx[ctx_cols] = WEIGHT_DIM; |
| 456 | ! |
response_weight = static_cast<int>(tensor[flat_index(resp_ctx, dims)]); |
| 457 |
} |
|
| 458 | ||
| 459 | ! |
arma::rowvec row_vec = codes.row(ri); |
| 460 |
arma::mat resp = connection_matrix( |
|
| 461 | ! |
g_ws, row_vec, static_cast<double>(response_weight), ordered); |
| 462 | ! |
g_w_mat += resp; |
| 463 | ! |
row_conn.row(i) = arma::vectorise(resp).t(); |
| 464 |
} |
|
| 465 | ||
| 466 | ! |
TensorNetworks result; |
| 467 | ! |
result.connection_counts = arma::vectorise(g_w_mat).t(); |
| 468 | ! |
result.row_connection_counts = row_conn; |
| 469 | ! |
return result; |
| 470 |
} |
|
| 471 | ||
| 472 |
/// @} |
|
| 473 | ||
| 474 |
/// @name Per-row co-occurrence and rolling window (rENA accumulation primitives) |
|
| 475 |
/// @{
|
|
| 476 | ||
| 477 |
/** @brief Compute the upper-triangle co-occurrence vector for each row. |
|
| 478 |
* |
|
| 479 |
* For each row, calls code_connections() and optionally binarizes the result. |
|
| 480 |
* |
|
| 481 |
* @param[in] codes Code matrix (n_rows × n_codes). |
|
| 482 |
* @param[in] binary When @c true, clamp all positive entries to 1. |
|
| 483 |
* |
|
| 484 |
* @returns Matrix of shape n_rows × choose_two(n_codes). |
|
| 485 |
* |
|
| 486 |
* @note Equivalent to @c rows_to_co_occurrences() in @c rENA/ena.cpp. |
|
| 487 |
*/ |
|
| 488 | 5x |
inline arma::mat row_connections( |
| 489 |
const arma::mat& codes, |
|
| 490 |
bool binary = true |
|
| 491 |
) {
|
|
| 492 | 5x |
int n_rows = codes.n_rows; |
| 493 | 5x |
int n_tri = choose_two(codes.n_cols); |
| 494 | 5x |
arma::mat out(n_rows, n_tri, arma::fill::zeros); |
| 495 | 16x |
for (int row = 0; row < n_rows; ++row) |
| 496 | 33x |
out.row(row) = code_connections(codes.row(row)); |
| 497 | 8x |
if (binary) out.elem(arma::find(out > 0)).ones(); |
| 498 | 5x |
return out; |
| 499 |
} |
|
| 500 | ||
| 501 |
/** @brief Rolling backward window sum of the raw code matrix. |
|
| 502 |
* |
|
| 503 |
* For each focal row @c k, sums rows [@c max(0, k - window_size + 1), @c k]. |
|
| 504 |
* Returns a matrix of the same shape as @p codes — no upper-triangle transform |
|
| 505 |
* is applied. A @p window_size of 0 or less is treated as 1 (current row only). |
|
| 506 |
* |
|
| 507 |
* @param[in] codes Code matrix (n_rows × n_codes). |
|
| 508 |
* @param[in] window_size Number of rows to include in the backward window |
|
| 509 |
* (including the focal row). Values <= 0 are clamped |
|
| 510 |
* to 1. |
|
| 511 |
* |
|
| 512 |
* @returns Matrix of the same shape as @p codes containing the windowed sums. |
|
| 513 |
* |
|
| 514 |
* @note Equivalent to @c ref_window_lag() in @c rENA/ena.cpp. |
|
| 515 |
*/ |
|
| 516 | 4x |
inline arma::mat rolling_window_sum( |
| 517 |
const arma::mat& codes, |
|
| 518 |
int window_size = 1 |
|
| 519 |
) {
|
|
| 520 | 4x |
int n_rows = codes.n_rows; |
| 521 | 4x |
arma::mat out(n_rows, codes.n_cols, arma::fill::zeros); |
| 522 | 16x |
for (int row = 0; row < n_rows; ++row) {
|
| 523 |
int earliest = (window_size > 0) |
|
| 524 |
? std::max(0, row - (window_size - 1)) |
|
| 525 | ! |
: row; // guard: window_size <= 0 -> single row |
| 526 | 36x |
out.row(row) = arma::sum(codes.rows(earliest, row)); |
| 527 |
} |
|
| 528 | 4x |
return out; |
| 529 |
} |
|
| 530 | ||
| 531 |
/// @} |
|
| 532 | ||
| 533 |
} // namespace qe |
|
| 534 | ||
| 535 |
#endif // LIBQE_ACCUMULATION_HPP |
| 1 |
/** |
|
| 2 |
* @file adjacency.hpp |
|
| 3 |
* @brief Combinatorics and vector/matrix utilities for ENA connection networks. |
|
| 4 |
* |
|
| 5 |
* Pure C++ / Armadillo — no Rcpp dependency. |
|
| 6 |
* Include @c <RcppArmadillo.h> before this header when building inside an R package. |
|
| 7 |
*/ |
|
| 8 |
#ifndef LIBQE_ADJACENCY_HPP |
|
| 9 |
#define LIBQE_ADJACENCY_HPP |
|
| 10 | ||
| 11 |
#include <armadillo> |
|
| 12 |
#include <string> |
|
| 13 |
#include <vector> |
|
| 14 |
#include <cmath> |
|
| 15 | ||
| 16 |
namespace qe {
|
|
| 17 | ||
| 18 |
// --------------------------------------------------------------------------- |
|
| 19 |
/// @name Combinatorics |
|
| 20 |
/// @{
|
|
| 21 |
// --------------------------------------------------------------------------- |
|
| 22 | ||
| 23 |
/** |
|
| 24 |
* @brief Number of unordered pairs from @p n items (n choose 2). |
|
| 25 |
* |
|
| 26 |
* @param n Number of codes/nodes. |
|
| 27 |
* @returns @c (n * (n - 1)) / 2. |
|
| 28 |
*/ |
|
| 29 | 67x |
inline int choose_two(int n) {
|
| 30 | 67x |
return (n * (n - 1)) / 2; |
| 31 |
} |
|
| 32 | ||
| 33 |
/** |
|
| 34 |
* @brief Upper-triangle index pairs for a symmetric matrix of side @p len. |
|
| 35 |
* |
|
| 36 |
* @param len Side length of the square matrix (number of codes). |
|
| 37 |
* @param row Controls what is returned: |
|
| 38 |
* - @c -1 (default): 2 × k matrix of [row_idx; col_idx]. |
|
| 39 |
* - @c 0: row indices only (1 × k). |
|
| 40 |
* - @c 1: column indices only (1 × k). |
|
| 41 |
* @returns An @c arma::umat of shape 2×k or 1×k depending on @p row. |
|
| 42 |
* @note Equivalent to @c triIndices() in rENA/ena.cpp and tma/code.cpp. |
|
| 43 |
*/ |
|
| 44 | 9x |
inline arma::umat connection_indices(int len, int row = -1) {
|
| 45 | 9x |
int vS = choose_two(len); |
| 46 | 9x |
int s = 0; |
| 47 | ||
| 48 | 9x |
arma::umat vR(2, vS, arma::fill::zeros); |
| 49 | 9x |
arma::umat vRone(1, vS, arma::fill::zeros); |
| 50 | ||
| 51 | 45x |
for (int i = 2; i <= len; i++) {
|
| 52 | 145x |
for (int j = 0; j < i - 1; j++) {
|
| 53 | 109x |
vR(0, s) = j; |
| 54 | 109x |
vR(1, s) = i - 1; |
| 55 | 115x |
if (row == 0) vRone[s] = j; |
| 56 | 109x |
else if (row == 1) vRone[s] = i - 1; |
| 57 | 109x |
s++; |
| 58 |
} |
|
| 59 |
} |
|
| 60 | 18x |
return (row == -1) ? vR : vRone; |
| 61 |
} |
|
| 62 | ||
| 63 |
/// @} |
|
| 64 | ||
| 65 |
// --------------------------------------------------------------------------- |
|
| 66 |
/// @name Vector ↔ upper-triangle conversions |
|
| 67 |
/// @{
|
|
| 68 |
// --------------------------------------------------------------------------- |
|
| 69 | ||
| 70 |
/** |
|
| 71 |
* @brief Compute pairwise products and return as a flat upper-triangle vector. |
|
| 72 |
* |
|
| 73 |
* For each pair @c (j, i) with @c j < i, outputs @c v[j] * v[i]. |
|
| 74 |
* The result has length @c choose_two(v.size()). |
|
| 75 |
* |
|
| 76 |
* @param v A 1 × p row vector (or p-element matrix). |
|
| 77 |
* @returns A row vector of length @c choose_two(p). |
|
| 78 |
* @note Equivalent to @c vector_to_ut() in rENA/ena.cpp. |
|
| 79 |
*/ |
|
| 80 | 40x |
inline arma::rowvec code_connections(arma::mat v) {
|
| 81 | 40x |
int vL = v.size(); |
| 82 | 40x |
int vS = choose_two(vL); |
| 83 | 40x |
arma::rowvec out(vS, arma::fill::zeros); |
| 84 | 40x |
int s = 0; |
| 85 | 130x |
for (int i = 2; i <= vL; i++) {
|
| 86 | 240x |
for (int j = 0; j < i - 1; j++) {
|
| 87 | 450x |
out[s] = v[j] * v[i - 1]; |
| 88 | 150x |
s++; |
| 89 |
} |
|
| 90 |
} |
|
| 91 | 40x |
return out; |
| 92 |
} |
|
| 93 | ||
| 94 |
/** |
|
| 95 |
* @brief Fold a directed n² vector into an undirected upper-triangle vector. |
|
| 96 |
* |
|
| 97 |
* Symmetric elements are summed: A→B + B→A for every pair. |
|
| 98 |
* |
|
| 99 |
* @param v Flat directed vector of length n², column-major (n = sqrt(v.size())). |
|
| 100 |
* @returns Upper-triangle row vector of length @c choose_two(n). |
|
| 101 |
* @note Equivalent to @c vector_to_summed_uppertri() in tma/code.cpp. |
|
| 102 |
*/ |
|
| 103 | 6x |
inline arma::rowvec fold_directed_network(arma::vec v) {
|
| 104 | 6x |
int n = static_cast<int>(std::round(std::sqrt(static_cast<double>(v.size())))); |
| 105 | 6x |
int tris = choose_two(n); |
| 106 | ||
| 107 | 6x |
arma::mat m(v); |
| 108 | 6x |
m.reshape(n, n); |
| 109 | 12x |
m = m + m.t(); |
| 110 | ||
| 111 | 6x |
arma::colvec flat = m.as_col(); |
| 112 | 6x |
arma::uvec up_inds = arma::trimatu_ind(arma::size(m), 1); |
| 113 | 12x |
return arma::conv_to<arma::rowvec>::from(flat.elem(up_inds)); |
| 114 |
} |
|
| 115 | ||
| 116 |
/** |
|
| 117 |
* @brief Flatten an adjacency matrix to a vector. |
|
| 118 |
* |
|
| 119 |
* @param x Square adjacency matrix (n × n). |
|
| 120 |
* @param full If @c true (default), returns the full n² vector (directed). |
|
| 121 |
* If @c false, returns the upper-triangle only (undirected). |
|
| 122 |
* @returns Row vector of length n² or @c choose_two(n). |
|
| 123 |
* @note Equivalent to @c adjacency_matrix_to_vector() in tma/code.cpp. |
|
| 124 |
*/ |
|
| 125 | 2x |
inline arma::rowvec network_to_vector(arma::mat x, bool full = true) {
|
| 126 | 3x |
if (full) return arma::vectorise(x).t(); |
| 127 | 2x |
arma::mat combined = arma::trimatu(x, 1) + arma::trimatl(x, -1); |
| 128 | 1x |
arma::uvec up_inds = arma::trimatu_ind(arma::size(combined), 1); |
| 129 | 2x |
return arma::vectorise(combined(up_inds)).t(); |
| 130 |
} |
|
| 131 | ||
| 132 |
/// @} |
|
| 133 | ||
| 134 |
// --------------------------------------------------------------------------- |
|
| 135 |
/// @name String utilities |
|
| 136 |
/// @{
|
|
| 137 |
// --------------------------------------------------------------------------- |
|
| 138 | ||
| 139 |
/** |
|
| 140 |
* @brief Generate "A & B" pair names for every upper-triangle position. |
|
| 141 |
* |
|
| 142 |
* For code names @c {"A", "B", "C"} returns @c {"A & B", "A & C", "B & C"}.
|
|
| 143 |
* |
|
| 144 |
* @param v Ordered list of code names (length p). |
|
| 145 |
* @returns Vector of @c choose_two(p) label strings. |
|
| 146 |
* @note Equivalent to @c svector_to_ut() in rENA/ena.cpp. |
|
| 147 |
*/ |
|
| 148 | 2x |
inline std::vector<std::string> connection_names(std::vector<std::string> v) {
|
| 149 | 2x |
int vL = v.size(); |
| 150 | 2x |
int vS = choose_two(vL); |
| 151 | 2x |
std::vector<std::string> out(vS); |
| 152 | 2x |
int s = 0; |
| 153 | 7x |
for (int i = 2; i <= vL; i++) {
|
| 154 | 14x |
for (int j = 0; j < i - 1; j++) {
|
| 155 | 9x |
out[s] = v[j] + " & " + v[i - 1]; |
| 156 | 9x |
s++; |
| 157 |
} |
|
| 158 |
} |
|
| 159 | 2x |
return out; |
| 160 |
} |
|
| 161 | ||
| 162 |
/// @} |
|
| 163 | ||
| 164 |
} // namespace qe |
|
| 165 | ||
| 166 |
#endif // LIBQE_ADJACENCY_HPP |
| 1 |
/** |
|
| 2 |
* @file generalized_rotation.hpp |
|
| 3 |
* @brief Generalized Means Rotation (GMR) with Lasso-based covariate adjustment. |
|
| 4 |
* |
|
| 5 |
* @details |
|
| 6 |
* Mirrors rENA's `ena.rotate.by.generalized()` + `gmr()` + `get_x1_main_effect()`. |
|
| 7 |
* Authoritative reference: commit 46776a1981a90b3a3b2861ed1010e9dbb7acf901 |
|
| 8 |
* ("Fixed bugs in rotation functions") by Zhiqiang Cai.
|
|
| 9 |
* |
|
| 10 |
* **Algorithm overview** |
|
| 11 |
* |
|
| 12 |
* 1. Compute x_vector: the rotation axis that best represents the target |
|
| 13 |
* variable's contribution to the ENA point space, after controlling for |
|
| 14 |
* covariates via Lasso (`lasso_x1_contribution`). |
|
| 15 |
* 2. Deflate V by x_vector → defA = V - V*x*x'. |
|
| 16 |
* 3. Optionally compute secondary axis x1: the normalised group-mean |
|
| 17 |
* difference in the *original* V space (not defA) for groups 0 vs 1. |
|
| 18 |
* x1 stays empty when the target is not categorical or there are not |
|
| 19 |
* exactly 2 groups — there is no SVD fallback. |
|
| 20 |
* 4. Compute y_vector from a GMR call on defA, or from SVD of defA. |
|
| 21 |
* Explicitly orthogonalise y_vector against x_vector, and against x1 |
|
| 22 |
* when x1 is available. |
|
| 23 |
* 5. Double-deflate V by x_vector and y_vector, then call `complete_rotation()` |
|
| 24 |
* to fill remaining dimensions from the SVD of the doubly-deflated space. |
|
| 25 |
* |
|
| 26 |
* **Caller responsibilities** |
|
| 27 |
* |
|
| 28 |
* - Build the model matrix (`X_raw`) from the data frame, including dummy |
|
| 29 |
* variables for categorical predictors and any desired interaction terms. |
|
| 30 |
* No formula parsing is done here. |
|
| 31 |
* - Identify `x1_cols`: the 0-based column indices in `X_raw` that correspond to |
|
| 32 |
* the target variable (these are unpenalized in the Lasso). |
|
| 33 |
* - Encode categorical targets as 0-based integer codes in `x_target`. |
|
| 34 |
* - Set `x_n_groups` to the number of distinct groups when `x_categorical=true`. |
|
| 35 |
* |
|
| 36 |
* The R wrapper (`rENA::ena.rotate.by.generalized`) handles these steps and |
|
| 37 |
* calls `libqe::generalized_means_rotation()` with the prepared matrices. |
|
| 38 |
*/ |
|
| 39 | ||
| 40 |
#ifndef LIBQE_GENERALIZED_ROTATION_HPP |
|
| 41 |
#define LIBQE_GENERALIZED_ROTATION_HPP |
|
| 42 | ||
| 43 |
#include "linalg_fallback.hpp" // qe::linalg::eig_sym (works w/o LAPACK) |
|
| 44 |
#include "rotation.hpp" // ena_svd, complete_rotation, RotationResult |
|
| 45 |
#include "lasso.hpp" // lasso_x1_contribution |
|
| 46 | ||
| 47 |
#include <armadillo> |
|
| 48 |
#include <stdexcept> |
|
| 49 |
#include <string> |
|
| 50 |
#include <vector> |
|
| 51 | ||
| 52 |
namespace qe {
|
|
| 53 | ||
| 54 |
/// @name Between-group scatter |
|
| 55 |
/// @{
|
|
| 56 | ||
| 57 |
/** |
|
| 58 |
* @brief Compute the between-group scatter matrix S_B. |
|
| 59 |
* |
|
| 60 |
* @details |
|
| 61 |
* Computes S_B = Σ_g n_g * (μ_g − μ)(μ_g − μ)' where μ is the grand mean. |
|
| 62 |
* |
|
| 63 |
* @note Mirrors rENA's `compute_SB()` (commit 46776a1981a90b3a3b2861ed1010e9dbb7acf901). |
|
| 64 |
* |
|
| 65 |
* @param[in] V n×p matrix of ENA points (or covariate-adjusted contribution). |
|
| 66 |
* @param[in] labels n×1 0-based integer group labels. |
|
| 67 |
* @param[in] n_groups Number of distinct groups. |
|
| 68 |
* |
|
| 69 |
* @returns p×p symmetric between-group scatter matrix. |
|
| 70 |
*/ |
|
| 71 | 4x |
inline arma::mat between_group_scatter( |
| 72 |
const arma::mat& V, |
|
| 73 |
const arma::uvec& labels, |
|
| 74 |
arma::uword n_groups |
|
| 75 |
) {
|
|
| 76 | 4x |
const arma::uword p = V.n_cols; |
| 77 | 4x |
arma::rowvec mu = arma::mean(V, 0); |
| 78 | 4x |
arma::mat SB(p, p, arma::fill::zeros); |
| 79 | ||
| 80 | 12x |
for (arma::uword g = 0; g < n_groups; ++g) {
|
| 81 | 8x |
arma::uvec idx = arma::find(labels == g); |
| 82 | 8x |
if (idx.is_empty()) continue; |
| 83 | 21x |
arma::vec diff = (arma::mean(V.rows(idx), 0) - mu).t(); |
| 84 | 14x |
SB += static_cast<double>(idx.n_elem) * diff * diff.t(); |
| 85 |
} |
|
| 86 | 8x |
return SB; |
| 87 |
} |
|
| 88 | ||
| 89 |
/// @} |
|
| 90 | ||
| 91 |
/// @name Rotation direction |
|
| 92 |
/// @{
|
|
| 93 | ||
| 94 |
/** |
|
| 95 |
* @brief Compute the unit rotation direction from a covariate-adjusted contribution matrix. |
|
| 96 |
* |
|
| 97 |
* @details |
|
| 98 |
* **Numeric target:** |
|
| 99 |
* Centres the target, then computes β = (t_c' t_c)^{-1} t_c' Vx (OLS slope from
|
|
| 100 |
* lm(Vx ~ target)). Mirrors R: `model = lm(Vx ~ target); beta = coefficients[2,]`. |
|
| 101 |
* |
|
| 102 |
* **Categorical target:** |
|
| 103 |
* Computes the between-group scatter matrix S_B and returns its leading |
|
| 104 |
* eigenvector (the direction of maximum between-group variance). |
|
| 105 |
* Mirrors R: `sb = compute_SB(Vx, target); r = svd(sb)$v[,1]`. |
|
| 106 |
* |
|
| 107 |
* @note Mirrors rENA's `gmr()` direction computation |
|
| 108 |
* (commit 46776a1981a90b3a3b2861ed1010e9dbb7acf901). |
|
| 109 |
* |
|
| 110 |
* @param[in] Vx n×q covariate-adjusted contribution matrix. |
|
| 111 |
* @param[in] target n×1 target variable (numeric float, or 0-based integer codes). |
|
| 112 |
* @param[in] categorical `true` if the target is categorical (uses between-group scatter); |
|
| 113 |
* `false` for a numeric target (uses OLS slope). |
|
| 114 |
* @param[in] n_groups Number of distinct groups; only used when `categorical` is `true`. |
|
| 115 |
* |
|
| 116 |
* @returns Unit vector (p×1) representing the rotation direction. |
|
| 117 |
* |
|
| 118 |
* @throws std::runtime_error If `categorical` is `true` and `n_groups < 2`. |
|
| 119 |
* @throws std::runtime_error If the numeric target has zero variance after centering. |
|
| 120 |
* @warning Throws `std::runtime_error` if the resulting direction has zero norm, |
|
| 121 |
* which can occur when the Lasso-adjusted contribution collapses to zero. |
|
| 122 |
*/ |
|
| 123 | 15x |
inline arma::vec gmr_direction( |
| 124 |
const arma::mat& Vx, // n×q covariate-adjusted contribution |
|
| 125 |
const arma::vec& target, // n×1 (numeric float, or 0-based int codes) |
|
| 126 |
bool categorical, |
|
| 127 |
arma::uword n_groups = 0 |
|
| 128 |
) {
|
|
| 129 | 15x |
arma::vec r; |
| 130 | ||
| 131 | 15x |
if (categorical) {
|
| 132 | 4x |
if (n_groups < 2) |
| 133 |
throw std::runtime_error( |
|
| 134 | ! |
"gmr_direction: x_n_groups must be >= 2 for categorical target"); |
| 135 | 4x |
arma::uvec labels = arma::conv_to<arma::uvec>::from(target); |
| 136 | 4x |
arma::mat SB = between_group_scatter(Vx, labels, n_groups); |
| 137 | ||
| 138 | 4x |
arma::vec eigval; arma::mat eigvec; |
| 139 | 4x |
qe::linalg::eig_sym(eigval, eigvec, SB); // ascending order |
| 140 | 8x |
r = eigvec.col(eigvec.n_cols - 1); // largest eigenvalue → last col |
| 141 |
} else {
|
|
| 142 |
// Centre target to get the OLS slope (mirrors R's lm() intercept handling) |
|
| 143 | 22x |
arma::vec tc = target - arma::mean(target); |
| 144 | 11x |
double tTt = arma::dot(tc, tc); |
| 145 | 11x |
if (tTt < 1e-12) |
| 146 |
throw std::runtime_error( |
|
| 147 | ! |
"gmr_direction: target has zero variance after centering"); |
| 148 | 11x |
arma::rowvec beta = (tc.t() * Vx) / tTt; // 1×q slope vector |
| 149 | 11x |
r = beta.t(); |
| 150 |
} |
|
| 151 | ||
| 152 | 15x |
double norm = arma::norm(r); |
| 153 | 15x |
if (norm < 1e-12) |
| 154 | ! |
throw std::runtime_error("gmr_direction: resulting direction has zero norm");
|
| 155 | 30x |
return r / norm; |
| 156 |
} |
|
| 157 | ||
| 158 |
/// @} |
|
| 159 | ||
| 160 |
/** |
|
| 161 |
* @brief Result of one GMR axis step. |
|
| 162 |
*/ |
|
| 163 |
struct GMRStepResult {
|
|
| 164 |
arma::vec direction; ///< Unit vector in connection space (p×1). |
|
| 165 |
}; |
|
| 166 | ||
| 167 |
/// @name GMR axis step |
|
| 168 |
/// @{
|
|
| 169 | ||
| 170 |
/** |
|
| 171 |
* @brief Compute the rotation direction for one axis of the Generalized Means Rotation. |
|
| 172 |
* |
|
| 173 |
* @details |
|
| 174 |
* Applies optional row subsetting, performs Lasso-based covariate adjustment |
|
| 175 |
* (or a simple OLS projection when all columns are target columns), then |
|
| 176 |
* delegates to `gmr_direction()` to produce a unit direction vector. |
|
| 177 |
* |
|
| 178 |
* When `x1_cols.n_elem >= p` (all columns are target columns and there are no |
|
| 179 |
* penalized covariates), the function falls back to a simple OLS projection: |
|
| 180 |
* Vx = fitted values from lm(V ~ t). |
|
| 181 |
* |
|
| 182 |
* @note Mirrors rENA's `gmr()` step logic |
|
| 183 |
* (commit 46776a1981a90b3a3b2861ed1010e9dbb7acf901). |
|
| 184 |
* |
|
| 185 |
* @param[in] V n×q matrix of ENA points (full data; subset applied internally). |
|
| 186 |
* @param[in] X_model n×p model matrix (caller-built; target cols + covariate cols). |
|
| 187 |
* @param[in] target n×1 target variable (raw float or 0-based integer codes). |
|
| 188 |
* @param[in] x1_cols 0-based indices of target columns in `X_model` (unpenalized in Lasso). |
|
| 189 |
* @param[in] categorical `true` if the target is categorical. |
|
| 190 |
* @param[in] n_groups Number of distinct groups (only used when `categorical` is `true`). |
|
| 191 |
* @param[in] subset Optional 0-based row indices to use; empty means use all rows. |
|
| 192 |
* @param[in] n_lambda Length of the Lasso lambda path (forwarded to `lasso_x1_contribution`). |
|
| 193 |
* @param[in] k_folds Number of cross-validation folds (forwarded to `lasso_x1_contribution`). |
|
| 194 |
* @param[in] lasso_eps lambda_min = lasso_eps * lambda_max (forwarded to `lasso_x1_contribution`). |
|
| 195 |
* |
|
| 196 |
* @returns `GMRStepResult` containing the unit direction vector. |
|
| 197 |
* |
|
| 198 |
* @throws std::runtime_error Propagated from `gmr_direction()` if the direction |
|
| 199 |
* is degenerate or the target has zero variance. |
|
| 200 |
*/ |
|
| 201 | 15x |
inline GMRStepResult gmr_step( |
| 202 |
const arma::mat& V, |
|
| 203 |
const arma::mat& X_model, |
|
| 204 |
const arma::vec& target, |
|
| 205 |
const arma::uvec& x1_cols, |
|
| 206 |
bool categorical, |
|
| 207 |
arma::uword n_groups, |
|
| 208 |
const arma::uvec& subset, |
|
| 209 |
int n_lambda = 50, |
|
| 210 |
int k_folds = 5, |
|
| 211 |
double lasso_eps = 0.01 |
|
| 212 |
) {
|
|
| 213 | 15x |
const arma::uword n = V.n_rows; |
| 214 | 15x |
const arma::uword p = X_model.n_cols; |
| 215 | ||
| 216 |
// ── Select rows ────────────────────────────────────────────────────────── |
|
| 217 | 15x |
arma::uvec rows = subset.is_empty() |
| 218 | 15x |
? arma::regspace<arma::uvec>(0, n - 1) |
| 219 | 15x |
: subset; |
| 220 | ||
| 221 | 15x |
const arma::mat V_sub = V.rows(rows); |
| 222 | 15x |
const arma::mat X_sub = X_model.rows(rows); |
| 223 | 30x |
const arma::vec t_sub = target.rows(rows); |
| 224 | ||
| 225 |
// ── Lasso-adjusted contribution Vx ─────────────────────────────────────── |
|
| 226 | 15x |
arma::mat Vx_sub; |
| 227 | 15x |
if (x1_cols.n_elem >= p) {
|
| 228 |
// All columns are target columns (no penalized covariates): |
|
| 229 |
// fall back to simple OLS projection (Vx = fitted values from lm(V~t)) |
|
| 230 | 30x |
arma::vec tc = t_sub - arma::mean(t_sub); |
| 231 | 15x |
double tTt = arma::dot(tc, tc); |
| 232 | 15x |
if (tTt > 1e-12) {
|
| 233 | 14x |
arma::rowvec b1 = (tc.t() * V_sub) / tTt; |
| 234 | 14x |
Vx_sub = tc * b1; |
| 235 |
} else {
|
|
| 236 | 1x |
Vx_sub.zeros(V_sub.n_rows, V_sub.n_cols); |
| 237 |
} |
|
| 238 |
} else {
|
|
| 239 | ! |
arma::vec pf(p, arma::fill::ones); |
| 240 | ! |
for (arma::uword j : x1_cols) pf(j) = 0.0; |
| 241 | ||
| 242 | ! |
Vx_sub = lasso_x1_contribution( |
| 243 | ! |
X_sub, V_sub, x1_cols, pf, n_lambda, k_folds, lasso_eps); |
| 244 |
} |
|
| 245 | ||
| 246 |
// ── Rotation direction ─────────────────────────────────────────────────── |
|
| 247 | 15x |
arma::vec direction = gmr_direction(Vx_sub, t_sub, categorical, n_groups); |
| 248 | ||
| 249 | 30x |
return {direction};
|
| 250 |
} |
|
| 251 | ||
| 252 |
/// @} |
|
| 253 | ||
| 254 |
/** |
|
| 255 |
* @brief Parameters for `generalized_means_rotation()`. |
|
| 256 |
* |
|
| 257 |
* @details |
|
| 258 |
* Groups all inputs needed to specify both the X and Y rotation axes, along |
|
| 259 |
* with Lasso tuning knobs. The Y axis is optional: when `has_y` is `false` |
|
| 260 |
* the second axis is taken from the leading SVD of the x-deflated space. |
|
| 261 |
*/ |
|
| 262 |
struct GeneralizedRotationParams {
|
|
| 263 |
/// @name X axis (required) |
|
| 264 |
/// @{
|
|
| 265 |
arma::mat x_model_matrix; ///< n×p model matrix for the X axis. |
|
| 266 |
arma::vec x_target; ///< n×1 target variable (raw float or 0-based integer codes). |
|
| 267 |
arma::uvec x1_cols; ///< 0-based target column indices in `x_model_matrix`. |
|
| 268 |
bool x_categorical = false; ///< `true` if the X target is categorical. |
|
| 269 |
arma::uword x_n_groups = 0; ///< Number of distinct groups (used when `x_categorical` is `true`). |
|
| 270 |
arma::uvec x_subset; ///< Optional row subset (0-based); empty means use all rows. |
|
| 271 |
/// @} |
|
| 272 | ||
| 273 |
/// @name Y axis (optional) |
|
| 274 |
/// @{
|
|
| 275 |
/// When `has_y` is `false`, the Y axis is taken from the leading SVD of the x-deflated space. |
|
| 276 |
bool has_y = false; ///< `true` if a Y-axis GMR target is provided. |
|
| 277 |
arma::mat y_model_matrix; ///< n×p model matrix for the Y axis. |
|
| 278 |
arma::vec y_target; ///< n×1 target variable for the Y axis. |
|
| 279 |
arma::uvec y1_cols; ///< 0-based target column indices in `y_model_matrix`. |
|
| 280 |
bool y_categorical = false; ///< `true` if the Y target is categorical. |
|
| 281 |
arma::uword y_n_groups = 0; ///< Number of distinct groups for the Y axis. |
|
| 282 |
/// The Y axis always uses all rows (no subset parameter). |
|
| 283 |
/// @} |
|
| 284 | ||
| 285 |
/// @name Lasso tuning |
|
| 286 |
/// @{
|
|
| 287 |
int n_lambda = 50; ///< Length of the lambda path. |
|
| 288 |
int k_folds = 5; ///< Number of cross-validation folds. |
|
| 289 |
double lasso_eps = 0.01; ///< lambda_min = lasso_eps * lambda_max. |
|
| 290 |
/// @} |
|
| 291 |
}; |
|
| 292 | ||
| 293 |
/// @name Generalized Means Rotation |
|
| 294 |
/// @{
|
|
| 295 | ||
| 296 |
/** |
|
| 297 |
* @brief Full Generalized Means Rotation (GMR) producing a complete q×q rotation matrix. |
|
| 298 |
* |
|
| 299 |
* @details |
|
| 300 |
* C++/Armadillo implementation of `ena.rotate.by.generalized()`, matching the |
|
| 301 |
* authoritative R implementation in commit 46776a1981a90b3a3b2861ed1010e9dbb7acf901 |
|
| 302 |
* by Zhiqiang Cai. |
|
| 303 |
* |
|
| 304 |
* **Steps performed:** |
|
| 305 |
* |
|
| 306 |
* 1. **X axis via GMR** — calls `gmr_step()` on V with the X-axis parameters. |
|
| 307 |
* 2. **Deflate V** — computes defA = V − V·x·x' for subsequent Y-axis work. |
|
| 308 |
* 3. **Secondary axis x1** — when the X target is categorical with ≥ 2 groups, |
|
| 309 |
* computes x1 = normalize(mean(V[group0]) − mean(V[group1])) using the |
|
| 310 |
* *original* V (not defA) and the full-length target (all n rows). |
|
| 311 |
* x1 is used only to orthogonalise y_vector; it does not deflate defA. |
|
| 312 |
* There is no SVD fallback: x1 stays empty if the condition is not met. |
|
| 313 |
* 4. **Y axis** — either GMR on defA (when `params.has_y` is `true`) or the |
|
| 314 |
* leading SVD direction of defA. |
|
| 315 |
* 5. **Orthogonalise y_vector** — mirrors R: |
|
| 316 |
* `y_vector <- y_vector - (t(y_vector) %*% x_vector) * x_vector` |
|
| 317 |
* and, when x1 is available, |
|
| 318 |
* `y_vector <- y_vector - (t(y_vector) %*% x1) * x1`, |
|
| 319 |
* followed by safe normalisation. |
|
| 320 |
* 6. **Complete with SVD** — double-deflates V by both x_vector and y_vector, |
|
| 321 |
* then calls `complete_rotation()` to fill remaining dimensions. |
|
| 322 |
* |
|
| 323 |
* @note Mirrors rENA's `ena.rotate.by.generalized()` |
|
| 324 |
* (commit 46776a1981a90b3a3b2861ed1010e9dbb7acf901). |
|
| 325 |
* |
|
| 326 |
* @param[in] V n×q matrix of ENA points (line weights; caller normalises/ |
|
| 327 |
* centres upstream). |
|
| 328 |
* @param[in] params Fully populated `GeneralizedRotationParams` struct. |
|
| 329 |
* |
|
| 330 |
* @returns `RotationResult` with: |
|
| 331 |
* - `rotation` q×q full rotation matrix (column j = axis j). |
|
| 332 |
* - `eigenvalues` q×1 variance per SVD-filled axis (named axes carry 0). |
|
| 333 |
* - `column_names` "GMR1", "GMR2" or "SVD2", "SVD3", …, "SVD{q}".
|
|
| 334 |
* |
|
| 335 |
* @throws std::runtime_error If `gmr_direction()` produces a degenerate direction |
|
| 336 |
* for the X or Y axis. |
|
| 337 |
* @throws std::runtime_error If y_vector has zero norm after orthogonalisation |
|
| 338 |
* (can happen when defA is rank-deficient or perfectly aligned with x_vector). |
|
| 339 |
*/ |
|
| 340 | 13x |
inline RotationResult generalized_means_rotation( |
| 341 |
const arma::mat& V, |
|
| 342 |
const GeneralizedRotationParams& params |
|
| 343 |
) {
|
|
| 344 |
// ── Step 1: X axis via GMR ──────────────────────────────────────────── |
|
| 345 |
GMRStepResult x_gmr = gmr_step( |
|
| 346 |
V, |
|
| 347 | 13x |
params.x_model_matrix, |
| 348 | 13x |
params.x_target, |
| 349 | 13x |
params.x1_cols, |
| 350 | 13x |
params.x_categorical, |
| 351 | 13x |
params.x_n_groups, |
| 352 | 13x |
params.x_subset, |
| 353 | 13x |
params.n_lambda, |
| 354 | 13x |
params.k_folds, |
| 355 | 13x |
params.lasso_eps); |
| 356 | ||
| 357 | 13x |
const arma::vec x_vector = x_gmr.direction; |
| 358 | ||
| 359 |
// ── Step 2: Deflate V by x_vector ───────────────────────────────────── |
|
| 360 |
// defA is used only for y_vector computation; x1 uses original V. |
|
| 361 | 26x |
const arma::mat defA = V - V * x_vector * x_vector.t(); |
| 362 | ||
| 363 |
// ── Step 3: Secondary axis x1 ───────────────────────────────────────── |
|
| 364 |
// Mirrors R: x1 = normalize(mean(V[group0]) - mean(V[group1])). |
|
| 365 |
// |
|
| 366 |
// Key points matching the authoritative R: |
|
| 367 |
// - Uses original V (not defA) |
|
| 368 |
// - Uses the full-length target (all n rows), not just the subset |
|
| 369 |
// - Groups are always 0 (first) and 1 (second) in 0-based encoding |
|
| 370 |
// - NO fallback: x1 stays empty when categorical/2-group condition fails |
|
| 371 |
// - x1 does NOT deflate defA; it is only used to orthogonalise y_vector |
|
| 372 | 13x |
arma::vec x1; |
| 373 | ||
| 374 | 13x |
if (params.x_categorical && params.x_n_groups >= 2) {
|
| 375 | 4x |
arma::uvec labels = arma::conv_to<arma::uvec>::from(params.x_target); |
| 376 | 4x |
arma::uvec g0 = arma::find(labels == 0); |
| 377 | 4x |
arma::uvec g1 = arma::find(labels == 1); |
| 378 | ||
| 379 |
if (!g0.is_empty() && !g1.is_empty()) {
|
|
| 380 | 4x |
arma::rowvec m0 = arma::mean(V.rows(g0), 0); // original V, not defA |
| 381 | 4x |
arma::rowvec m1 = arma::mean(V.rows(g1), 0); |
| 382 | 4x |
arma::vec diff = (m0 - m1).t(); |
| 383 | 4x |
double dlen = arma::norm(diff); |
| 384 | 8x |
if (dlen > 1e-10) x1 = diff / dlen; |
| 385 |
} |
|
| 386 |
} |
|
| 387 | ||
| 388 |
// ── Step 4: Y axis ──────────────────────────────────────────────────── |
|
| 389 |
// GMR on defA (deflated by x_vector only), or SVD of defA. |
|
| 390 | 13x |
arma::vec y_vector; |
| 391 | 13x |
std::string y_name = "SVD2"; |
| 392 | ||
| 393 | 13x |
if (params.has_y) {
|
| 394 |
GMRStepResult y_gmr = gmr_step( |
|
| 395 |
defA, |
|
| 396 | 2x |
params.y_model_matrix, |
| 397 | 2x |
params.y_target, |
| 398 | 2x |
params.y1_cols, |
| 399 | 2x |
params.y_categorical, |
| 400 | 2x |
params.y_n_groups, |
| 401 |
{}, // no row subset for y axis
|
|
| 402 | 2x |
params.n_lambda, |
| 403 | 2x |
params.k_folds, |
| 404 | 2x |
params.lasso_eps); |
| 405 | 2x |
y_vector = y_gmr.direction; |
| 406 | 2x |
y_name = "GMR2"; |
| 407 |
} else {
|
|
| 408 | 11x |
RotationResult svd_r = ena_svd(defA); |
| 409 | 11x |
y_vector = svd_r.rotation.col(0); |
| 410 |
} |
|
| 411 | ||
| 412 |
// ── Step 5: Orthogonalise y_vector ──────────────────────────────────── |
|
| 413 |
// Mirrors R (authoritative commit): |
|
| 414 |
// y_vector <- y_vector - (t(y_vector) %*% x_vector) * x_vector |
|
| 415 |
// if (!is.null(x1)) y_vector <- y_vector - (t(y_vector) %*% x1) * x1 |
|
| 416 |
// y_vector <- safe_normalize(y_vector) |
|
| 417 | 26x |
y_vector -= arma::dot(y_vector, x_vector) * x_vector; |
| 418 | 13x |
if (!x1.is_empty()) |
| 419 | 8x |
y_vector -= arma::dot(y_vector, x1) * x1; |
| 420 | ||
| 421 | 13x |
double yn = arma::norm(y_vector); |
| 422 | 13x |
if (yn < 1e-12) |
| 423 |
throw std::runtime_error( |
|
| 424 | ! |
"generalized_means_rotation: y_vector has zero norm after orthogonalization"); |
| 425 | 13x |
y_vector /= yn; |
| 426 | ||
| 427 |
// ── Step 6: Combine and complete with SVD ───────────────────────────── |
|
| 428 |
// Double-deflate the original data (mirrors R): |
|
| 429 |
// defA <- A - A %*% x %*% t(x) - A %*% y %*% t(y) |
|
| 430 |
// then pass to prcomp (equivalent: SVD of defA_final). |
|
| 431 |
arma::mat defA_final = V |
|
| 432 | 13x |
- V * x_vector * x_vector.t() |
| 433 | 26x |
- V * y_vector * y_vector.t(); |
| 434 | ||
| 435 | 13x |
arma::mat named_axes = arma::join_rows(x_vector, y_vector); |
| 436 | 65x |
return complete_rotation(defA_final, named_axes, {"GMR1", y_name});
|
| 437 |
} |
|
| 438 | ||
| 439 |
/// @} |
|
| 440 | ||
| 441 |
} // namespace qe |
|
| 442 |
#endif // LIBQE_GENERALIZED_ROTATION_HPP |
| 1 |
/** |
|
| 2 |
* @file lasso.hpp |
|
| 3 |
* @brief Coordinate-descent Lasso for multivariate response with k-fold CV. |
|
| 4 |
* |
|
| 5 |
* Mirrors `glmnet(x, y, family = "mgaussian", penalty.factor = ...)` with |
|
| 6 |
* k-fold cross-validation to select lambda, and warm-starts along the lambda |
|
| 7 |
* path for efficiency. |
|
| 8 |
* |
|
| 9 |
* The public entry point is qe::lasso_x1_contribution(), which returns an |
|
| 10 |
* n×q matrix of fitted values attributable to the "target" predictors at the |
|
| 11 |
* CV-selected lambda. It is used by `generalized_means_rotation` to isolate |
|
| 12 |
* the target variable's contribution to the ENA point space after controlling |
|
| 13 |
* for covariates. |
|
| 14 |
* |
|
| 15 |
* **Penalty factors (`pf`):** |
|
| 16 |
* - `0.0` — predictor is always included (forced in; no shrinkage) |
|
| 17 |
* - `1.0` — standard L1 penalty |
|
| 18 |
* |
|
| 19 |
* The caller is responsible for building the model matrix (main effects and/or |
|
| 20 |
* interactions); this function accepts a numeric matrix, not a formula. |
|
| 21 |
*/ |
|
| 22 | ||
| 23 |
#ifndef LIBQE_LASSO_HPP |
|
| 24 |
#define LIBQE_LASSO_HPP |
|
| 25 | ||
| 26 |
#include <armadillo> |
|
| 27 |
#include <cmath> |
|
| 28 |
#include <limits> |
|
| 29 |
#include <stdexcept> |
|
| 30 | ||
| 31 |
namespace qe {
|
|
| 32 | ||
| 33 |
/** |
|
| 34 |
* @brief Soft-threshold (shrinkage) operator used by coordinate descent. |
|
| 35 |
* |
|
| 36 |
* @details Computes the scalar soft-threshold function: |
|
| 37 |
* \f$ S(z,\gamma) = \operatorname{sign}(z)\,\max(|z|-\gamma,\,0) \f$
|
|
| 38 |
* |
|
| 39 |
* @param[in] z Input value. |
|
| 40 |
* @param[in] gamma Threshold amount (must be non-negative for meaningful use). |
|
| 41 |
* @returns Soft-thresholded value; exactly zero when `|z| <= gamma`. |
|
| 42 |
*/ |
|
| 43 | ! |
inline double soft_threshold(double z, double gamma) noexcept {
|
| 44 | ! |
return z > gamma ? z - gamma : z < -gamma ? z + gamma : 0.0; |
| 45 |
} |
|
| 46 | ||
| 47 |
/** |
|
| 48 |
* @brief Single-response Lasso via cyclic coordinate descent at a fixed lambda. |
|
| 49 |
* |
|
| 50 |
* @details Fits a Lasso regression for a single response vector using cyclic |
|
| 51 |
* coordinate descent (naive updates). The partial-residual update for |
|
| 52 |
* predictor \f$j\f$ is: |
|
| 53 |
* \f[ |
|
| 54 |
* z_j = \frac{X_j^\top r}{n} + \frac{X_j^\top X_j}{n}\,\beta_j
|
|
| 55 |
* \f] |
|
| 56 |
* \f[ |
|
| 57 |
* \beta_j \leftarrow \frac{S\!\left(z_j,\;\lambda\,\mathit{pf}_j\right)}{X_j^\top X_j / n}
|
|
| 58 |
* \f] |
|
| 59 |
* where \f$r = y - X\beta\f$ is the residual maintained incrementally. |
|
| 60 |
* Predictors with \f$\mathit{pf}_j = 0\f$ skip the soft-threshold and receive
|
|
| 61 |
* the unconstrained OLS update instead. |
|
| 62 |
* |
|
| 63 |
* @param[in] X n×p column-standardized predictor matrix. |
|
| 64 |
* @param[in] y n×1 response vector (centering recommended but not required). |
|
| 65 |
* @param[in] lambda Regularization strength, on the standardized predictor scale. |
|
| 66 |
* @param[in] pf p×1 penalty factors (0 = unpenalized / forced in, 1 = standard L1). |
|
| 67 |
* @param[in] beta0 Warm-start coefficient vector; pass an empty `arma::vec{}`
|
|
| 68 |
* for a cold start from zero. |
|
| 69 |
* @param[in] max_iter Maximum number of coordinate-descent passes (default 1000). |
|
| 70 |
* @param[in] tol Convergence tolerance on the maximum coefficient change |
|
| 71 |
* per pass (default 1e-7). |
|
| 72 |
* @returns p×1 coefficient vector on the standardized predictor scale. |
|
| 73 |
*/ |
|
| 74 | ! |
inline arma::vec lasso_cd( |
| 75 |
const arma::mat& X, |
|
| 76 |
const arma::vec& y, |
|
| 77 |
double lambda, |
|
| 78 |
const arma::vec& pf, |
|
| 79 |
arma::vec beta0 = {},
|
|
| 80 |
int max_iter = 1000, |
|
| 81 |
double tol = 1e-7 |
|
| 82 |
) {
|
|
| 83 | ! |
const arma::uword p = X.n_cols; |
| 84 | ! |
const double n = static_cast<double>(X.n_rows); |
| 85 | ||
| 86 |
// Precompute X_j'X_j / n for each predictor |
|
| 87 | ! |
arma::vec xTx(p); |
| 88 | ! |
for (arma::uword j = 0; j < p; ++j) |
| 89 | ! |
xTx(j) = arma::dot(X.col(j), X.col(j)) / n; |
| 90 | ||
| 91 | ! |
arma::vec beta = beta0.is_empty() |
| 92 | ! |
? arma::vec(p, arma::fill::zeros) |
| 93 | ! |
: beta0; |
| 94 | ! |
arma::vec r = y - X * beta; // residual, maintained incrementally |
| 95 | ||
| 96 | ! |
for (int it = 0; it < max_iter; ++it) {
|
| 97 | ! |
double max_delta = 0.0; |
| 98 | ||
| 99 | ! |
for (arma::uword j = 0; j < p; ++j) {
|
| 100 | ! |
if (xTx(j) < 1e-14) continue; // skip zero-variance predictor |
| 101 | ||
| 102 |
// Partial correlation statistic |
|
| 103 | ! |
double z = arma::dot(X.col(j), r) / n + xTx(j) * beta(j); |
| 104 | ||
| 105 | ! |
double b_new = (pf(j) < 1e-12) |
| 106 | ! |
? z / xTx(j) // forced in: OLS |
| 107 | ! |
: soft_threshold(z, lambda * pf(j)) / xTx(j); // L1 |
| 108 | ||
| 109 | ! |
double delta = b_new - beta(j); |
| 110 | ! |
if (std::abs(delta) > max_delta) max_delta = std::abs(delta); |
| 111 | ! |
r -= X.col(j) * delta; |
| 112 | ! |
beta(j) = b_new; |
| 113 |
} |
|
| 114 | ||
| 115 | ! |
if (max_delta < tol) break; |
| 116 |
} |
|
| 117 | ! |
return beta; |
| 118 |
} |
|
| 119 | ||
| 120 |
/** |
|
| 121 |
* @brief Compute the smallest lambda that drives all penalized coefficients to zero. |
|
| 122 |
* |
|
| 123 |
* @details For column-standardized `X` and column-centered `Yc`, lambda_max is: |
|
| 124 |
* \f[ |
|
| 125 |
* \lambda_{\max} = \max_{\substack{j:\,\mathit{pf}_j > 0 \\ k}} \frac{|X_j^\top Y_{c,k}|}{n}
|
|
| 126 |
* \f] |
|
| 127 |
* Above this value every penalized coefficient is exactly zero (unpenalized |
|
| 128 |
* predictors are unaffected and remain in via OLS regardless of lambda). |
|
| 129 |
* |
|
| 130 |
* @param[in] X n×p column-standardized predictor matrix. |
|
| 131 |
* @param[in] Yc n×q column-centered response matrix. |
|
| 132 |
* @param[in] pf p×1 penalty factors; predictors with `pf(j) < 1e-12` are skipped. |
|
| 133 |
* @returns Scalar lambda_max value. |
|
| 134 |
*/ |
|
| 135 | ! |
inline double lasso_lambda_max( |
| 136 |
const arma::mat& X, // column-standardized |
|
| 137 |
const arma::mat& Yc, // column-centered responses |
|
| 138 |
const arma::vec& pf |
|
| 139 |
) {
|
|
| 140 | ! |
const double n = static_cast<double>(X.n_rows); |
| 141 | ! |
double lmax = 0.0; |
| 142 | ! |
for (arma::uword j = 0; j < X.n_cols; ++j) {
|
| 143 | ! |
if (pf(j) < 1e-12) continue; // skip unpenalized predictors |
| 144 | ! |
for (arma::uword k = 0; k < Yc.n_cols; ++k) {
|
| 145 | ! |
double v = std::abs(arma::dot(X.col(j), Yc.col(k))) / n; |
| 146 | ! |
if (v > lmax) lmax = v; |
| 147 |
} |
|
| 148 |
} |
|
| 149 | ! |
return lmax; |
| 150 |
} |
|
| 151 | ||
| 152 |
/** |
|
| 153 |
* @brief Cross-validated multivariate Lasso; returns fitted values for target columns only. |
|
| 154 |
* |
|
| 155 |
* @details Fits a multivariate Lasso (one independent single-response problem per |
|
| 156 |
* response column) along a log-spaced lambda path from lambda_max down to |
|
| 157 |
* `eps * lambda_max`. Lambda is selected by k-fold cross-validation |
|
| 158 |
* (mean squared error, averaged over folds and response columns), and the |
|
| 159 |
* model is refit on the full data at the selected lambda using warm-starts |
|
| 160 |
* along the path. |
|
| 161 |
* |
|
| 162 |
* **Preprocessing performed internally:** |
|
| 163 |
* - `X_raw` is column-standardized to zero mean and unit variance (zero-variance |
|
| 164 |
* columns are left as-is to avoid division by zero). |
|
| 165 |
* - `Y` is column-centered to absorb the intercept. |
|
| 166 |
* - Coefficients are unstandardized before the contribution is computed. |
|
| 167 |
* |
|
| 168 |
* **Fold assignment:** row \f$i\f$ is assigned to fold \f$i \bmod k\f$ |
|
| 169 |
* (deterministic, order-dependent). |
|
| 170 |
* |
|
| 171 |
* **Return value:** The n×q matrix \f$X_{\text{raw},x_1} \hat{B}_{x_1}\f$
|
|
| 172 |
* represents the portion of `Y` (mean-removed) that is linearly explained by |
|
| 173 |
* the target predictors after penalizing covariate coefficients. A zero |
|
| 174 |
* matrix is returned when the target predictors carry no signal at any lambda |
|
| 175 |
* (e.g., the target is constant after subsetting). Constant shifts cancel |
|
| 176 |
* in the downstream between-group scatter (categorical) and slope regression |
|
| 177 |
* (numeric), so no `ymu` adjustment is applied to the return value. |
|
| 178 |
* |
|
| 179 |
* @param[in] X_raw n×p raw model matrix built by the caller (main effects |
|
| 180 |
* and/or interactions); no formula parsing is done here. |
|
| 181 |
* @param[in] Y n×q ENA point matrix (multivariate response). |
|
| 182 |
* @param[in] x1_cols 0-based column indices in `X_raw` that belong to the |
|
| 183 |
* target variable; these receive `pf = 0` (always included). |
|
| 184 |
* @param[in] pf p×1 penalty factors: 0 for `x1_cols`, 1 elsewhere. |
|
| 185 |
* The caller constructs this; non-standard weightings are |
|
| 186 |
* supported. |
|
| 187 |
* @param[in] n_lambda Length of the lambda path (default 50). |
|
| 188 |
* @param[in] k_folds Number of cross-validation folds (default 5; clamped to |
|
| 189 |
* `[2, n/2]`). |
|
| 190 |
* @param[in] eps Ratio `lambda_min / lambda_max`; controls how far down |
|
| 191 |
* the path to search (default 0.01). |
|
| 192 |
* @returns n×q matrix of fitted values attributable to `x1_cols` at the |
|
| 193 |
* CV-selected lambda (lambda.min), on the original `Y` scale. |
|
| 194 |
*/ |
|
| 195 | ! |
inline arma::mat lasso_x1_contribution( |
| 196 |
const arma::mat& X_raw, |
|
| 197 |
const arma::mat& Y, |
|
| 198 |
const arma::uvec& x1_cols, |
|
| 199 |
const arma::vec& pf, |
|
| 200 |
int n_lambda = 50, |
|
| 201 |
int k_folds = 5, |
|
| 202 |
double eps = 0.01 |
|
| 203 |
) {
|
|
| 204 | ! |
const arma::uword n = X_raw.n_rows; |
| 205 | ! |
const arma::uword p = X_raw.n_cols; |
| 206 | ! |
const arma::uword q = Y.n_cols; |
| 207 | ||
| 208 |
// Guard: need at least 2 folds and 2 observations per fold |
|
| 209 | ! |
k_folds = std::max(2, std::min(k_folds, static_cast<int>(n) / 2)); |
| 210 | ||
| 211 |
// ── Standardize X (μ=0, σ=1); guard zero-variance predictors ──────────── |
|
| 212 | ! |
arma::rowvec xmu = arma::mean(X_raw, 0); |
| 213 | ! |
arma::rowvec xstd = arma::stddev(X_raw, 0, 0); |
| 214 | ! |
xstd.elem(arma::find(xstd < 1e-12)).ones(); // avoid divide-by-zero |
| 215 | ! |
arma::mat X = (X_raw.each_row() - xmu).each_row() / xstd; |
| 216 | ||
| 217 |
// ── Center Y (absorbs intercept into the standardized-scale fit) ───────── |
|
| 218 | ! |
arma::rowvec ymu = arma::mean(Y, 0); |
| 219 | ! |
arma::mat Yc = Y.each_row() - ymu; |
| 220 | ||
| 221 |
// ── Build lambda path: lambda_max → lambda_min on log scale ───────────── |
|
| 222 | ! |
double lmax = lasso_lambda_max(X, Yc, pf); |
| 223 | ! |
if (lmax < 1e-12) // target uncorrelated with Y after standardization |
| 224 | ! |
return arma::mat(n, q, arma::fill::zeros); |
| 225 | ||
| 226 | ! |
arma::vec lambdas = arma::exp( |
| 227 | ! |
arma::linspace(std::log(lmax), |
| 228 |
std::log(eps * lmax), |
|
| 229 | ! |
n_lambda)); |
| 230 | ||
| 231 |
// ── Stratified k-fold CV (fold assignment: row i → fold i % k_folds) ──── |
|
| 232 | ! |
arma::uvec fold_id(n); |
| 233 | ! |
for (arma::uword i = 0; i < n; ++i) |
| 234 | ! |
fold_id(i) = i % static_cast<arma::uword>(k_folds); |
| 235 | ||
| 236 | ! |
arma::vec cv_err(n_lambda, arma::fill::zeros); |
| 237 | ||
| 238 | ! |
for (int fold = 0; fold < k_folds; ++fold) {
|
| 239 | ! |
arma::uvec tr = arma::find(fold_id != static_cast<arma::uword>(fold)); |
| 240 | ! |
arma::uvec va = arma::find(fold_id == static_cast<arma::uword>(fold)); |
| 241 | ||
| 242 | ! |
const arma::mat X_tr = X.rows(tr), X_va = X.rows(va); |
| 243 | ! |
const arma::mat Y_tr = Yc.rows(tr), Y_va = Yc.rows(va); |
| 244 | ! |
const double n_va_q = static_cast<double>(va.n_elem * q); |
| 245 | ||
| 246 |
// Warm-start along path for this fold (start cold at lambda_max) |
|
| 247 | ! |
arma::mat bw(p, q, arma::fill::zeros); |
| 248 | ! |
for (int l = 0; l < n_lambda; ++l) {
|
| 249 | ! |
arma::mat b(p, q); |
| 250 | ! |
for (arma::uword k = 0; k < q; ++k) |
| 251 | ! |
b.col(k) = lasso_cd(X_tr, Y_tr.col(k), lambdas(l), pf, bw.col(k)); |
| 252 | ! |
bw = b; |
| 253 | ||
| 254 | ! |
arma::mat err = Y_va - X_va * b; |
| 255 | ! |
cv_err(l) += arma::accu(err % err) / n_va_q; |
| 256 |
} |
|
| 257 |
} |
|
| 258 | ! |
cv_err /= static_cast<double>(k_folds); |
| 259 | ||
| 260 |
// ── Refit on full data at lambda_min, warming along the path ───────────── |
|
| 261 | ! |
arma::uword best = cv_err.index_min(); |
| 262 | ! |
arma::mat bfull(p, q, arma::fill::zeros); |
| 263 | ! |
for (arma::uword l = 0; l <= best; ++l) {
|
| 264 | ! |
arma::mat b(p, q); |
| 265 | ! |
for (arma::uword k = 0; k < q; ++k) |
| 266 | ! |
b.col(k) = lasso_cd(X, Yc.col(k), lambdas(l), pf, bfull.col(k)); |
| 267 | ! |
bfull = b; |
| 268 |
} |
|
| 269 | ||
| 270 |
// ── Unstandardize: beta_orig[j] = beta_std[j] / xstd[j] ───────────────── |
|
| 271 | ! |
for (arma::uword j = 0; j < p; ++j) |
| 272 | ! |
bfull.row(j) /= xstd(j); |
| 273 | ||
| 274 |
// ── Return fitted contribution from x1 columns only ────────────────────── |
|
| 275 |
// This is the portion of Y (mean-removed) linearly explained by the target |
|
| 276 |
// predictors after covariate adjustment. Constant shifts cancel in both |
|
| 277 |
// the between-group scatter (categorical) and slope regression (numeric) |
|
| 278 |
// that follow, so no ymu adjustment is needed here. |
|
| 279 | ! |
return X_raw.cols(x1_cols) * bfull.rows(x1_cols); |
| 280 |
} |
|
| 281 | ||
| 282 |
} // namespace qe |
|
| 283 |
#endif // LIBQE_LASSO_HPP |
| 1 |
/** @file linalg_fallback.hpp |
|
| 2 |
* @brief Thin LAPACK-free implementations of SVD, QR, and symmetric |
|
| 3 |
* eigendecomposition for use in WASM / no-LAPACK builds. |
|
| 4 |
* |
|
| 5 |
* When @c ARMA_USE_LAPACK is defined these wrappers delegate directly to the |
|
| 6 |
* standard Armadillo routines (@c arma::svd, @c arma::qr, @c arma::eig_sym), |
|
| 7 |
* so there is zero overhead in normal builds. When LAPACK is not available |
|
| 8 |
* (e.g. Emscripten WASM builds) the wrappers fall back to pure C++ |
|
| 9 |
* implementations that work for the small matrices typical in ENA |
|
| 10 |
* (p ≤ ~100 connection dimensions). |
|
| 11 |
* |
|
| 12 |
* All routines live in namespace @c qe::linalg and operate on @c arma::mat |
|
| 13 |
* (double precision). |
|
| 14 |
* |
|
| 15 |
* Fallback algorithms: |
|
| 16 |
* - **SVD** — one-sided Jacobi iterations on \f$A^T\f$ (Demmel & Veselić 1992) |
|
| 17 |
* - **QR** — modified Gram-Schmidt |
|
| 18 |
* - **eig_sym** — cyclic Jacobi (classical Jacobi sweep) |
|
| 19 |
*/ |
|
| 20 | ||
| 21 |
#ifndef LIBQE_LINALG_FALLBACK_HPP |
|
| 22 |
#define LIBQE_LINALG_FALLBACK_HPP |
|
| 23 | ||
| 24 |
#include <armadillo> |
|
| 25 |
#include <cmath> |
|
| 26 |
#include <limits> |
|
| 27 | ||
| 28 |
namespace qe {
|
|
| 29 |
namespace linalg {
|
|
| 30 | ||
| 31 |
/// @cond INTERNAL |
|
| 32 |
// ── helpers ─────────────────────────────────────────────────────────────────── |
|
| 33 | ||
| 34 |
namespace detail {
|
|
| 35 | ||
| 36 |
/// @brief Modified Gram-Schmidt orthonormalization applied in-place. |
|
| 37 |
/// |
|
| 38 |
/// Columns of @p Q are successively orthonormalized against all previously |
|
| 39 |
/// processed columns. A column whose norm falls below 1e-14 after projection |
|
| 40 |
/// is left unchanged (not zeroed) to avoid introducing NaNs. |
|
| 41 |
/// |
|
| 42 |
/// @param Q Matrix whose columns are orthonormalized in-place (modified). |
|
| 43 |
inline void mgs(arma::mat& Q) {
|
|
| 44 |
const arma::uword p = Q.n_cols; |
|
| 45 |
for (arma::uword j = 0; j < p; ++j) {
|
|
| 46 |
for (arma::uword i = 0; i < j; ++i) |
|
| 47 |
Q.col(j) -= arma::dot(Q.col(i), Q.col(j)) * Q.col(i); |
|
| 48 |
double n = arma::norm(Q.col(j)); |
|
| 49 |
if (n > 1e-14) Q.col(j) /= n; |
|
| 50 |
} |
|
| 51 |
} |
|
| 52 | ||
| 53 |
/// @brief One cyclic Jacobi sweep on a p×p symmetric matrix. |
|
| 54 |
/// |
|
| 55 |
/// Iterates over all super-diagonal index pairs @c (q,r) and applies a |
|
| 56 |
/// two-sided Jacobi rotation that annihilates @c A(q,r). The same rotation |
|
| 57 |
/// is accumulated in @p V so that, starting with @c V = I, the full sequence |
|
| 58 |
/// of sweeps yields @c V = eigenvector matrix. |
|
| 59 |
/// |
|
| 60 |
/// @param A Symmetric p×p matrix, updated in-place toward diagonal form. |
|
| 61 |
/// @param V Rotation accumulator (p×p); initialize to identity to recover |
|
| 62 |
/// eigenvectors. |
|
| 63 |
/// |
|
| 64 |
/// @note Algorithm follows the classical cyclic-by-rows Jacobi scheme. |
|
| 65 |
/// Off-diagonal entries no larger than 1e-14 × (|a_qq| + |a_rr|) are |
|
| 66 |
/// skipped for numerical stability. The `<=` (not `<`) guard correctly |
|
| 67 |
/// handles the all-zero case (a_qq = a_rr = a_qr = 0) that would |
|
| 68 |
/// otherwise produce NaN via 0/0 in the theta computation. |
|
| 69 |
inline void jacobi_sweep(arma::mat& A, arma::mat& V) {
|
|
| 70 |
const arma::uword p = A.n_rows; |
|
| 71 |
for (arma::uword q = 0; q < p - 1; ++q) {
|
|
| 72 |
for (arma::uword r = q + 1; r < p; ++r) {
|
|
| 73 |
double aqq = A(q, q), arr = A(r, r), aqr = A(q, r); |
|
| 74 |
if (std::abs(aqr) <= 1e-14 * (std::abs(aqq) + std::abs(arr))) |
|
| 75 |
continue; |
|
| 76 |
double theta = (arr - aqq) / (2.0 * aqr); |
|
| 77 |
double t = (theta >= 0.0) |
|
| 78 |
? 1.0 / (theta + std::sqrt(1.0 + theta * theta)) |
|
| 79 |
: 1.0 / (theta - std::sqrt(1.0 + theta * theta)); |
|
| 80 |
double c = 1.0 / std::sqrt(1.0 + t * t); ///< cosine of Jacobi rotation angle |
|
| 81 |
double s = t * c; ///< sine of Jacobi rotation angle |
|
| 82 |
// Apply Jacobi rotation to A (symmetric update) |
|
| 83 |
double new_qq = aqq - t * aqr; |
|
| 84 |
double new_rr = arr + t * aqr; |
|
| 85 |
A(q, q) = new_qq; A(r, r) = new_rr; A(q, r) = A(r, q) = 0.0; |
|
| 86 |
for (arma::uword k = 0; k < p; ++k) {
|
|
| 87 |
if (k == q || k == r) continue; |
|
| 88 |
double akq = A(k, q), akr = A(k, r); |
|
| 89 |
A(k, q) = A(q, k) = c * akq - s * akr; |
|
| 90 |
A(k, r) = A(r, k) = s * akq + c * akr; |
|
| 91 |
} |
|
| 92 |
// Accumulate rotation in V |
|
| 93 |
for (arma::uword k = 0; k < p; ++k) {
|
|
| 94 |
double vkq = V(k, q), vkr = V(k, r); |
|
| 95 |
V(k, q) = c * vkq - s * vkr; |
|
| 96 |
V(k, r) = s * vkq + c * vkr; |
|
| 97 |
} |
|
| 98 |
} |
|
| 99 |
} |
|
| 100 |
} |
|
| 101 | ||
| 102 |
} // namespace detail |
|
| 103 | ||
| 104 |
/// @endcond |
|
| 105 | ||
| 106 |
/// @name Eigendecomposition |
|
| 107 |
/// @{
|
|
| 108 | ||
| 109 |
// ── eig_sym ─────────────────────────────────────────────────────────────────── |
|
| 110 | ||
| 111 |
/// @brief Eigendecomposition of a real symmetric matrix. |
|
| 112 |
/// |
|
| 113 |
/// Computes eigenvalues and eigenvectors of the symmetric p×p matrix @p S and |
|
| 114 |
/// returns them sorted in **ascending** eigenvalue order. |
|
| 115 |
/// |
|
| 116 |
/// @param[out] eigval p×1 vector of eigenvalues, ascending. |
|
| 117 |
/// @param[out] eigvec p×p matrix whose columns are the corresponding |
|
| 118 |
/// orthonormal eigenvectors. |
|
| 119 |
/// @param[in] S Real symmetric p×p input matrix (not modified). |
|
| 120 |
/// |
|
| 121 |
/// @warning With @c ARMA_USE_LAPACK defined this calls @c arma::eig_sym |
|
| 122 |
/// directly. Without LAPACK the fallback runs up to @c 50*p cyclic |
|
| 123 |
/// Jacobi sweeps; convergence is guaranteed for well-conditioned |
|
| 124 |
/// matrices but may be slow for p > 200. |
|
| 125 |
/// |
|
| 126 |
/// @note Fallback algorithm: classical cyclic-by-rows Jacobi iteration. |
|
| 127 |
/// Convergence is declared when the Frobenius norm of the strictly |
|
| 128 |
/// off-diagonal part drops below 1e-13. |
|
| 129 | 4x |
inline void eig_sym(arma::vec& eigval, arma::mat& eigvec, const arma::mat& S) {
|
| 130 |
#if defined(ARMA_USE_LAPACK) |
|
| 131 | 4x |
arma::eig_sym(eigval, eigvec, S); |
| 132 |
#else |
|
| 133 |
const arma::uword p = S.n_rows; |
|
| 134 |
arma::mat A = S; ///< working copy of S |
|
| 135 |
eigvec.eye(p, p); ///< accumulate rotations; starts as identity |
|
| 136 | ||
| 137 |
for (int sweep = 0; sweep < 50 * static_cast<int>(p); ++sweep) {
|
|
| 138 |
// Check off-diagonal convergence |
|
| 139 |
double off = 0.0; |
|
| 140 |
for (arma::uword q = 0; q < p - 1; ++q) |
|
| 141 |
for (arma::uword r = q + 1; r < p; ++r) |
|
| 142 |
off += A(q, r) * A(q, r); |
|
| 143 |
if (std::sqrt(off) < 1e-13) break; |
|
| 144 |
detail::jacobi_sweep(A, eigvec); |
|
| 145 |
} |
|
| 146 |
eigval = A.diag(); |
|
| 147 | ||
| 148 |
// Sort ascending |
|
| 149 |
arma::uvec idx = arma::sort_index(eigval); |
|
| 150 |
eigval = eigval(idx); |
|
| 151 |
eigvec = eigvec.cols(idx); |
|
| 152 |
#endif |
|
| 153 |
} |
|
| 154 | ||
| 155 |
/// @} |
|
| 156 | ||
| 157 |
/// @name QR Factorization |
|
| 158 |
/// @{
|
|
| 159 | ||
| 160 |
// ── qr_full ─────────────────────────────────────────────────────────────────── |
|
| 161 | ||
| 162 |
/// @brief Full (square) QR factorization of an n×p matrix. |
|
| 163 |
/// |
|
| 164 |
/// Produces a square n×n orthogonal factor @p Q and an n×p upper-trapezoidal |
|
| 165 |
/// factor @p R such that @c A = Q*R. This matches the convention of |
|
| 166 |
/// @c arma::qr(), which returns the full (non-economy) @p Q. |
|
| 167 |
/// |
|
| 168 |
/// @param[out] Q n×n orthogonal matrix. |
|
| 169 |
/// @param[out] R n×p upper-trapezoidal matrix. |
|
| 170 |
/// @param[in] A n×p input matrix (not modified). |
|
| 171 |
/// |
|
| 172 |
/// @warning With @c ARMA_USE_LAPACK defined this calls @c arma::qr directly. |
|
| 173 |
/// Without LAPACK the fallback constructs the full ONB by seeding |
|
| 174 |
/// columns beyond @c A.n_cols with standard basis vectors and then |
|
| 175 |
/// applying modified Gram-Schmidt; numerical quality degrades for |
|
| 176 |
/// nearly rank-deficient inputs. |
|
| 177 |
/// |
|
| 178 |
/// @note The caller (@c orthogonal_svd) depends on @p Q being n×n so that |
|
| 179 |
/// @c Q.cols(k, n-1) spans the null space of @c A^T. |
|
| 180 | 8x |
inline void qr_full(arma::mat& Q, arma::mat& R, const arma::mat& A) {
|
| 181 |
#if defined(ARMA_USE_LAPACK) |
|
| 182 | 8x |
arma::qr(Q, R, A); |
| 183 |
#else |
|
| 184 |
// Build a full n×n orthogonal matrix. |
|
| 185 |
// Step 1: thin QR via modified Gram-Schmidt (gives first k=A.n_cols columns). |
|
| 186 |
// Step 2: extend to a full ONB using random vectors + re-orthogonalisation. |
|
| 187 |
const arma::uword n = A.n_rows, p = A.n_cols; |
|
| 188 | ||
| 189 |
// Start with all n columns of A padded with random vectors if n > p. |
|
| 190 |
arma::mat Qfull(n, n, arma::fill::zeros); |
|
| 191 |
// Copy A's columns first |
|
| 192 |
for (arma::uword j = 0; j < p && j < n; ++j) |
|
| 193 |
Qfull.col(j) = A.col(j); |
|
| 194 |
// Fill remaining columns with standard basis vectors (or random fallback) |
|
| 195 |
for (arma::uword j = p; j < n; ++j) {
|
|
| 196 |
Qfull(j, j) = 1.0; ///< standard basis vector e_j used as seed |
|
| 197 |
} |
|
| 198 |
// Full MGS orthonormalization |
|
| 199 |
detail::mgs(Qfull); |
|
| 200 |
Q = Qfull; |
|
| 201 | ||
| 202 |
// R is n×p (upper trapezoidal); only first p rows are non-zero |
|
| 203 |
R = Q.t() * A; |
|
| 204 |
// Zero the strictly lower-triangular part of R[0:p, 0:p] |
|
| 205 |
for (arma::uword j = 0; j < p; ++j) |
|
| 206 |
for (arma::uword i = j + 1; i < n; ++i) |
|
| 207 |
R(i, j) = 0.0; |
|
| 208 |
#endif |
|
| 209 |
} |
|
| 210 | ||
| 211 |
/// @} |
|
| 212 | ||
| 213 |
/// @name Singular Value Decomposition |
|
| 214 |
/// @{
|
|
| 215 | ||
| 216 |
// ── svd ─────────────────────────────────────────────────────────────────────── |
|
| 217 | ||
| 218 |
/// @brief Full SVD: @c A ≈ U * diag(s) * V', singular values descending. |
|
| 219 |
/// |
|
| 220 |
/// Decomposes the n×p matrix @p A into left singular vectors @p U, singular |
|
| 221 |
/// values @p s, and right singular vectors @p V. |
|
| 222 |
/// |
|
| 223 |
/// @param[out] U n×min(n,p) matrix of left singular vectors. |
|
| 224 |
/// @param[out] s min(n,p)×1 vector of singular values, descending. |
|
| 225 |
/// @param[out] V p×p matrix of right singular vectors (all p columns). |
|
| 226 |
/// @param[in] A n×p input matrix (not modified). |
|
| 227 |
/// |
|
| 228 |
/// @warning With @c ARMA_USE_LAPACK defined this calls @c arma::svd directly. |
|
| 229 |
/// Without LAPACK the fallback computes @c A^T*A, calls @c eig_sym, |
|
| 230 |
/// and derives @p U via @c U[:,j] = A*V[:,j]/s_j. Columns with |
|
| 231 |
/// @c s_j < 1e-14 are zeroed. Accuracy is governed by the |
|
| 232 |
/// conditioning of @c A^T*A (squares the condition number of @p A). |
|
| 233 |
/// |
|
| 234 |
/// @note Fallback algorithm: eigendecomposition of \f$A^T A\f$ via cyclic |
|
| 235 |
/// Jacobi (Demmel & Veselić 1992). Suitable for the small, moderately |
|
| 236 |
/// conditioned matrices typical in ENA (p ≤ ~100). |
|
| 237 | 39x |
inline void svd(arma::mat& U, arma::vec& s, arma::mat& V, const arma::mat& A) {
|
| 238 |
#if defined(ARMA_USE_LAPACK) |
|
| 239 | 39x |
arma::svd(U, s, V, A); |
| 240 |
#else |
|
| 241 |
// One-sided Jacobi on A^T: |
|
| 242 |
// We work with B = A^T (p×n). The right singular vectors of A are the |
|
| 243 |
// left singular vectors of B (= columns of U_B after Jacobi on B^T B). |
|
| 244 |
// We use the eigendecomposition of A^T * A to get V and s. |
|
| 245 |
const arma::uword n = A.n_rows, p = A.n_cols; |
|
| 246 |
const arma::uword k = std::min(n, p); |
|
| 247 | ||
| 248 |
arma::mat AtA = A.t() * A; ///< p×p symmetric Gram matrix |
|
| 249 |
arma::vec eigval; |
|
| 250 |
arma::mat eigvec; |
|
| 251 |
eig_sym(eigval, eigvec, AtA); |
|
| 252 | ||
| 253 |
// Eigenvalues from eig_sym are ascending; we want descending. |
|
| 254 |
arma::uvec idx = arma::sort_index(eigval, "descend"); |
|
| 255 |
eigval = eigval(idx); |
|
| 256 |
eigvec = eigvec.cols(idx); |
|
| 257 | ||
| 258 |
// Singular values (non-negative square roots) |
|
| 259 |
s.set_size(k); |
|
| 260 |
for (arma::uword j = 0; j < k; ++j) |
|
| 261 |
s(j) = std::sqrt(std::max(0.0, eigval(j))); |
|
| 262 | ||
| 263 |
// V = eigvec (right singular vectors of A are eigvectors of A^T*A) |
|
| 264 |
V = eigvec; |
|
| 265 | ||
| 266 |
// U = A * V / s (left singular vectors, only first k columns needed) |
|
| 267 |
U.set_size(n, k); |
|
| 268 |
for (arma::uword j = 0; j < k; ++j) {
|
|
| 269 |
if (s(j) > 1e-14) |
|
| 270 |
U.col(j) = A * V.col(j) / s(j); |
|
| 271 |
else |
|
| 272 |
U.col(j).zeros(); |
|
| 273 |
} |
|
| 274 |
#endif |
|
| 275 |
} |
|
| 276 | ||
| 277 |
// ── svd_right ───────────────────────────────────────────────────────────────── |
|
| 278 | ||
| 279 |
/// @brief Singular values and right singular vectors only (left vectors skipped). |
|
| 280 |
/// |
|
| 281 |
/// A lighter-weight variant of @c svd() for callers that only need @p s and |
|
| 282 |
/// @p V. Under LAPACK the full @c arma::svd is still called internally; the |
|
| 283 |
/// savings come in WASM builds where computing @p U is avoided. |
|
| 284 |
/// |
|
| 285 |
/// @param[out] s min(n,p)×1 vector of singular values, descending. |
|
| 286 |
/// @param[out] V p×p matrix of right singular vectors. |
|
| 287 |
/// @param[in] A n×p input matrix (not modified). |
|
| 288 |
/// |
|
| 289 |
/// @warning With @c ARMA_USE_LAPACK defined this still invokes the full |
|
| 290 |
/// @c arma::svd; the left-vector output is discarded. |
|
| 291 |
inline void svd_right(arma::vec& s, arma::mat& V, const arma::mat& A) {
|
|
| 292 |
#if defined(ARMA_USE_LAPACK) |
|
| 293 |
arma::mat U; |
|
| 294 |
arma::svd(U, s, V, A); |
|
| 295 |
#else |
|
| 296 |
arma::mat U_unused; |
|
| 297 |
svd(U_unused, s, V, A); |
|
| 298 |
#endif |
|
| 299 |
} |
|
| 300 | ||
| 301 |
/// @} |
|
| 302 | ||
| 303 |
/// @name Linear System Solve |
|
| 304 |
/// @{
|
|
| 305 | ||
| 306 |
// ── solve_spd ───────────────────────────────────────────────────────────────── |
|
| 307 | ||
| 308 |
/// @brief Solve @c A*x = b for symmetric positive (semi-)definite @p A. |
|
| 309 |
/// |
|
| 310 |
/// Returns the n×k solution matrix @p x given the n×n system matrix @p A and |
|
| 311 |
/// the n×k right-hand side @p b. Near-zero eigenvalues are handled gracefully, |
|
| 312 |
/// making this a pseudo-inverse solve safe for rank-deficient Gram matrices. |
|
| 313 |
/// |
|
| 314 |
/// @param[in] A Real symmetric positive (semi-)definite n×n matrix. |
|
| 315 |
/// @param[in] b n×k right-hand side matrix. |
|
| 316 |
/// @param[in] tol Relative threshold below which eigenvalues are treated as |
|
| 317 |
/// zero. Eigenvalues with @c |λ| < tol * max(|λ|) are |
|
| 318 |
/// inverted as zero (i.e. those directions are projected out). |
|
| 319 |
/// Default: 1e-12. |
|
| 320 |
/// @returns n×k solution matrix @p x satisfying @c A*x ≈ b. |
|
| 321 |
/// |
|
| 322 |
/// @warning With @c ARMA_USE_LAPACK defined this delegates to |
|
| 323 |
/// @c arma::solve(A, b, arma::solve_opts::fast), which uses a |
|
| 324 |
/// Cholesky factorization and does **not** handle rank deficiency. |
|
| 325 |
/// Without LAPACK the eigendecomposition-based pseudo-inverse is used, |
|
| 326 |
/// which is safe for rank-deficient inputs but slower. |
|
| 327 |
/// |
|
| 328 |
/// @note Without LAPACK: @c A = V * diag(λ) * V^T is computed via @c eig_sym; |
|
| 329 |
/// the solution is @c x = V * diag(1/λ) * V^T * b with small λ skipped. |
|
| 330 | 8x |
inline arma::mat solve_spd(const arma::mat& A, const arma::mat& b, |
| 331 |
double tol = 1e-12) {
|
|
| 332 |
#if defined(ARMA_USE_LAPACK) |
|
| 333 | 8x |
return arma::solve(A, b, arma::solve_opts::fast); |
| 334 |
#else |
|
| 335 |
arma::vec eigval; |
|
| 336 |
arma::mat eigvec; |
|
| 337 |
eig_sym(eigval, eigvec, A); ///< A = eigvec * diag(eigval) * eigvec^T |
|
| 338 | ||
| 339 |
const double thresh = tol * std::abs(eigval.max()); |
|
| 340 |
arma::vec inv_eigval(eigval.n_elem, arma::fill::zeros); |
|
| 341 |
for (arma::uword i = 0; i < eigval.n_elem; ++i) |
|
| 342 |
if (std::abs(eigval(i)) > thresh) |
|
| 343 |
inv_eigval(i) = 1.0 / eigval(i); |
|
| 344 | ||
| 345 |
// x = eigvec * diag(inv_eigval) * eigvec^T * b |
|
| 346 |
return eigvec * arma::diagmat(inv_eigval) * eigvec.t() * b; |
|
| 347 |
#endif |
|
| 348 |
} |
|
| 349 | ||
| 350 |
/// @} |
|
| 351 | ||
| 352 |
} // namespace linalg |
|
| 353 |
} // namespace qe |
|
| 354 | ||
| 355 |
#endif // LIBQE_LINALG_FALLBACK_HPP |
| 1 |
/** |
|
| 2 |
* @file modeling.hpp |
|
| 3 |
* @brief ENA modeling utilities: centering, correlation, confidence intervals, |
|
| 4 |
* and node-position solvers for undirected and directed ENA networks. |
|
| 5 |
*/ |
|
| 6 |
#ifndef LIBQE_MODELING_HPP |
|
| 7 |
#define LIBQE_MODELING_HPP |
|
| 8 | ||
| 9 |
#include <armadillo> |
|
| 10 |
#include <cmath> |
|
| 11 |
#include <libqe/linalg_fallback.hpp> |
|
| 12 | ||
| 13 |
namespace qe {
|
|
| 14 | ||
| 15 |
// --------------------------------------------------------------------------- |
|
| 16 |
/// @name Return types |
|
| 17 |
/// @{
|
|
| 18 |
// --------------------------------------------------------------------------- |
|
| 19 | ||
| 20 |
/** |
|
| 21 |
* @brief Aggregated result returned by every node-position solver. |
|
| 22 |
* |
|
| 23 |
* All matrices share the same dimensional conventions: |
|
| 24 |
* rows index units (or nodes), columns index rotated dimensions. |
|
| 25 |
*/ |
|
| 26 |
struct NodePositions {
|
|
| 27 |
arma::mat nodes; ///< Solved node locations, n_codes × n_dims. |
|
| 28 |
arma::mat centroids; ///< Per-unit centroids projected onto the node space, n_units × n_dims. |
|
| 29 |
arma::mat weights; ///< Normalised half-edge weight per unit per node, n_units × n_codes. |
|
| 30 |
arma::mat points; ///< Input rotated points echoed back unchanged, n_units × n_dims. |
|
| 31 |
}; |
|
| 32 | ||
| 33 |
/// @} |
|
| 34 | ||
| 35 |
// --------------------------------------------------------------------------- |
|
| 36 |
/// @name Centering |
|
| 37 |
/// @{
|
|
| 38 |
// --------------------------------------------------------------------------- |
|
| 39 | ||
| 40 |
/** |
|
| 41 |
* @brief Subtract column means so that every dimension is centred at the origin. |
|
| 42 |
* |
|
| 43 |
* @param[in] values Matrix of ENA unit points (n_units × n_dims). |
|
| 44 |
* @returns A copy of @p values with each column mean subtracted (centre-to-origin). |
|
| 45 |
* |
|
| 46 |
* @note Equivalent to `center_data_c()` in rENA/ena.cpp. |
|
| 47 |
*/ |
|
| 48 | 2x |
inline arma::mat center_points(arma::mat values) {
|
| 49 | 4x |
return values.each_row() - arma::mean(values); |
| 50 |
} |
|
| 51 | ||
| 52 |
/// @} |
|
| 53 | ||
| 54 |
/// @cond INTERNAL |
|
| 55 | ||
| 56 |
// --------------------------------------------------------------------------- |
|
| 57 |
// Normal quantile (probit) — pure C++, no R dependency |
|
| 58 |
// Rational approximation by Peter Acklam; max abs error < 1.15e-9. |
|
| 59 |
// Used internally by ena_correlation() so we don't need Rcpp::qnorm. |
|
| 60 |
// --------------------------------------------------------------------------- |
|
| 61 | ||
| 62 | 16x |
inline double normal_quantile(double p) {
|
| 63 |
static const double a[] = {
|
|
| 64 |
-3.969683028665376e+01, 2.209460984245205e+02, |
|
| 65 |
-2.759285104469687e+02, 1.383577518672690e+02, |
|
| 66 |
-3.066479806614716e+01, 2.506628277459239e+00 |
|
| 67 |
}; |
|
| 68 |
static const double b[] = {
|
|
| 69 |
-5.447609879822406e+01, 1.615858368580409e+02, |
|
| 70 |
-1.556989798598866e+02, 6.680131188771972e+01, |
|
| 71 |
-1.328068155288572e+01 |
|
| 72 |
}; |
|
| 73 |
static const double c[] = {
|
|
| 74 |
-7.784894002430293e-03, -3.223964580411365e-01, |
|
| 75 |
-2.400758277161838e+00, -2.549732539343734e+00, |
|
| 76 |
4.374664141464968e+00, 2.938163982698783e+00 |
|
| 77 |
}; |
|
| 78 |
static const double d[] = {
|
|
| 79 |
7.784695709041462e-03, 3.224671290700398e-01, |
|
| 80 |
2.445134137142996e+00, 3.754408661907416e+00 |
|
| 81 |
}; |
|
| 82 | 16x |
const double p_low = 0.02425; |
| 83 | 16x |
const double p_high = 1.0 - p_low; |
| 84 |
double q, r; |
|
| 85 | 16x |
if (p < p_low) {
|
| 86 | ! |
q = std::sqrt(-2.0 * std::log(p)); |
| 87 | ! |
return (((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / |
| 88 | ! |
((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1.0); |
| 89 | 16x |
} else if (p <= p_high) {
|
| 90 | 15x |
q = p - 0.5; r = q * q; |
| 91 | 15x |
return (((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q / |
| 92 | 15x |
(((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r+1.0); |
| 93 |
} else {
|
|
| 94 | 1x |
q = std::sqrt(-2.0 * std::log(1.0 - p)); |
| 95 | 1x |
return -(((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / |
| 96 | 1x |
((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1.0); |
| 97 |
} |
|
| 98 |
} |
|
| 99 | ||
| 100 |
// --------------------------------------------------------------------------- |
|
| 101 |
// Internal helpers: regularized incomplete beta + t-distribution quantile |
|
| 102 |
// --------------------------------------------------------------------------- |
|
| 103 |
// |
|
| 104 |
// These live in qe::detail so they don't pollute the public qe namespace. |
|
| 105 |
// They are used by mean_ci() and are not part of the public API. |
|
| 106 | ||
| 107 |
namespace detail {
|
|
| 108 | ||
| 109 |
// Lentz continued-fraction evaluation for the regularized incomplete beta. |
|
| 110 |
// Ported from Numerical Recipes in C, §6.4. |
|
| 111 | 52x |
inline double betacf(double a, double b, double x) {
|
| 112 | 52x |
const int MAXIT = 200; |
| 113 | 52x |
const double EPS = std::numeric_limits<double>::epsilon(); |
| 114 | 52x |
const double FPMIN = std::numeric_limits<double>::min() / EPS; |
| 115 | 52x |
double qab = a + b, qap = a + 1.0, qam = a - 1.0; |
| 116 | 52x |
double c = 1.0; |
| 117 | 52x |
double d = 1.0 - qab * x / qap; |
| 118 |
if (std::abs(d) < FPMIN) d = FPMIN; |
|
| 119 | 52x |
d = 1.0 / d; |
| 120 | 52x |
double h = d; |
| 121 | 755x |
for (int m = 1; m <= MAXIT; ++m) {
|
| 122 | 755x |
const int m2 = 2 * m; |
| 123 |
// even step |
|
| 124 | 755x |
double aa = static_cast<double>(m) * (b - static_cast<double>(m)) * x |
| 125 | 755x |
/ ((qam + m2) * (a + m2)); |
| 126 |
d = 1.0 + aa * d; if (std::abs(d) < FPMIN) d = FPMIN; |
|
| 127 |
c = 1.0 + aa / c; if (std::abs(c) < FPMIN) c = FPMIN; |
|
| 128 | 755x |
d = 1.0 / d; h *= d * c; |
| 129 |
// odd step |
|
| 130 | 755x |
aa = -(a + static_cast<double>(m)) * (qab + static_cast<double>(m)) * x |
| 131 | 755x |
/ ((a + m2) * (qap + m2)); |
| 132 |
d = 1.0 + aa * d; if (std::abs(d) < FPMIN) d = FPMIN; |
|
| 133 |
c = 1.0 + aa / c; if (std::abs(c) < FPMIN) c = FPMIN; |
|
| 134 | 755x |
d = 1.0 / d; |
| 135 | 755x |
const double del = d * c; |
| 136 | 755x |
h *= del; |
| 137 | 755x |
if (std::abs(del - 1.0) <= EPS) break; |
| 138 |
} |
|
| 139 | 52x |
return h; |
| 140 |
} |
|
| 141 | ||
| 142 |
// Regularized incomplete beta I_x(a, b). |
|
| 143 | 52x |
inline double betai(double a, double b, double x) {
|
| 144 |
if (x <= 0.0) return 0.0; |
|
| 145 |
if (x >= 1.0) return 1.0; |
|
| 146 | 52x |
const double lbab = std::lgamma(a + b) - std::lgamma(a) - std::lgamma(b); |
| 147 | 52x |
const double bt = std::exp(lbab + a * std::log(x) + b * std::log(1.0 - x)); |
| 148 | 52x |
if (x < (a + 1.0) / (a + b + 2.0)) |
| 149 | 42x |
return bt * betacf(a, b, x) / a; |
| 150 | 10x |
return 1.0 - bt * betacf(b, a, 1.0 - x) / b; |
| 151 |
} |
|
| 152 | ||
| 153 |
// Quantile function (inverse CDF) for the t-distribution with `df` degrees |
|
| 154 |
// of freedom. Uses the relation: |
|
| 155 |
// |
|
| 156 |
// P(T ≤ t | df) = 1 - I_x(df/2, 1/2) / 2, x = df / (df + t^2) |
|
| 157 |
// |
|
| 158 |
// Inverted via Newton–Raphson on I_x with bisection fallback; normal-quantile |
|
| 159 |
// plus one Cornish–Fisher correction term provides the starting guess. |
|
| 160 | 14x |
inline double t_quantile(double p, double df) {
|
| 161 | 14x |
const double INF = std::numeric_limits<double>::infinity(); |
| 162 |
if (p <= 0.0) return -INF; |
|
| 163 |
if (p >= 1.0) return INF; |
|
| 164 |
if (p == 0.5) return 0.0; |
|
| 165 | 14x |
if (df <= 0.0) return INF; // df = 0 → t-distribution undefined → treat as ∞ |
| 166 | ||
| 167 |
// Work with pp > 0.5 for numerical stability; negate at the end if needed. |
|
| 168 | 13x |
const bool flip = (p < 0.5); |
| 169 |
const double pp = flip ? 1.0 - p : p; |
|
| 170 | 13x |
const double a = 0.5 * df; |
| 171 | 13x |
const double bv = 0.5; |
| 172 | ||
| 173 |
// We need x in (0,1) such that I_x(a, bv) = 2*(1-pp). |
|
| 174 | 13x |
const double target = 2.0 * (1.0 - pp); |
| 175 | ||
| 176 |
// Initial guess via normal quantile + single Cornish–Fisher correction. |
|
| 177 | 13x |
const double z = normal_quantile(pp); |
| 178 | 13x |
const double t0 = z + (z * z * z + z) / (4.0 * df); |
| 179 | 13x |
double x = df / (df + t0 * t0); |
| 180 | 13x |
x = std::max(1e-12, std::min(1.0 - 1e-12, x)); |
| 181 | ||
| 182 |
// Newton–Raphson with a bisection bracket. |
|
| 183 |
// f(x) = I_x(a,bv) - target; f'(x) = x^(a-1)*(1-x)^(bv-1) / B(a,bv) |
|
| 184 | 13x |
const double lbab = std::lgamma(a + bv) - std::lgamma(a) - std::lgamma(bv); |
| 185 | 13x |
double xlo = 0.0, xhi = 1.0; |
| 186 | 52x |
for (int iter = 0; iter < 100; ++iter) {
|
| 187 | 52x |
const double fx = betai(a, bv, x) - target; |
| 188 |
// betai is increasing in x, so fx < 0 → x too small → raise lower bound |
|
| 189 | 52x |
if (fx < 0.0) xlo = x; else xhi = x; |
| 190 | ||
| 191 |
// Derivative: may underflow near boundaries — fall back to bisection. |
|
| 192 | 52x |
const double log_fpx = lbab + (a - 1.0) * std::log(x) |
| 193 | 52x |
+ (bv - 1.0) * std::log(1.0 - x); |
| 194 |
const double fpx = (log_fpx > -700.0) ? std::exp(log_fpx) : 0.0; |
|
| 195 | ||
| 196 |
double x_new; |
|
| 197 | 52x |
if (fpx > 0.0) {
|
| 198 | 52x |
x_new = x - fx / fpx; |
| 199 | 52x |
if (x_new <= xlo || x_new >= xhi) |
| 200 | 2x |
x_new = 0.5 * (xlo + xhi); |
| 201 |
} else {
|
|
| 202 | ! |
x_new = 0.5 * (xlo + xhi); |
| 203 |
} |
|
| 204 | ||
| 205 | 52x |
if (std::abs(x_new - x) < 1e-13 * x) break; |
| 206 | 39x |
x = x_new; |
| 207 |
} |
|
| 208 | ||
| 209 | 13x |
const double t_val = std::sqrt(df * (1.0 - x) / x); |
| 210 |
return flip ? -t_val : t_val; |
|
| 211 |
} |
|
| 212 | ||
| 213 |
// --------------------------------------------------------------------------- |
|
| 214 |
// IQR helper — matches R's quantile(type = 7) (Hyndman–Fan #7), which is |
|
| 215 |
// identical to numpy's np.percentile(method="linear"). |
|
| 216 |
// |
|
| 217 |
// For a sorted n-element vector and probability p: |
|
| 218 |
// h = (n - 1) * p |
|
| 219 |
// lo = floor(h), frac = h - lo |
|
| 220 |
// Q(p) = sorted[lo] * (1 - frac) + sorted[lo+1] * frac |
|
| 221 |
// --------------------------------------------------------------------------- |
|
| 222 | ||
| 223 | 22x |
inline double quantile_type7(const arma::vec& sorted, double p) {
|
| 224 | 22x |
const int n = static_cast<int>(sorted.n_elem); |
| 225 |
if (n == 0) return std::numeric_limits<double>::quiet_NaN(); |
|
| 226 |
if (n == 1) return sorted[0]; |
|
| 227 | 22x |
const double h = (n - 1) * p; |
| 228 | 22x |
const int lo = static_cast<int>(std::floor(h)); |
| 229 | 22x |
const double frac = h - lo; |
| 230 |
if (lo + 1 >= n) return sorted[n - 1]; // guard: p == 1.0 |
|
| 231 | 66x |
return sorted[lo] * (1.0 - frac) + sorted[lo + 1] * frac; |
| 232 |
} |
|
| 233 | ||
| 234 |
// Interquartile range for a column vector — uses the same type-7 quantile as R. |
|
| 235 | 11x |
inline double iqr(const arma::vec& col) {
|
| 236 | 11x |
const arma::vec s = arma::sort(col); |
| 237 | 22x |
return quantile_type7(s, 0.75) - quantile_type7(s, 0.25); |
| 238 |
} |
|
| 239 | ||
| 240 |
} // namespace detail |
|
| 241 | ||
| 242 |
/// @endcond |
|
| 243 | ||
| 244 |
// --------------------------------------------------------------------------- |
|
| 245 |
/// @name Correlation |
|
| 246 |
/// @{
|
|
| 247 |
// --------------------------------------------------------------------------- |
|
| 248 | ||
| 249 |
/** |
|
| 250 |
* @brief Pearson correlation with Fisher-z confidence interval between ENA |
|
| 251 |
* unit points and their centroids. |
|
| 252 |
* |
|
| 253 |
* All unique pairs of units are formed; for each pair the per-dimension |
|
| 254 |
* difference vectors are computed, then Pearson r is obtained between the |
|
| 255 |
* @p points differences and the @p centroids differences. The confidence |
|
| 256 |
* interval is derived via Fisher's z-transform using the pure-C++ |
|
| 257 |
* `normal_quantile` function so the function works outside R without |
|
| 258 |
* `Rcpp::qnorm`. |
|
| 259 |
* |
|
| 260 |
* @param[in] points Rotated ENA unit points, n_units × n_dims. |
|
| 261 |
* @param[in] centroids Corresponding centroid coordinates, n_units × n_dims. |
|
| 262 |
* @param[in] conf_level Confidence level for the interval (default 0.95). |
|
| 263 |
* @returns An n_dims × 3 matrix whose columns are [r, ci_lower, ci_upper]. |
|
| 264 |
* |
|
| 265 |
* @note Equivalent to `ena_correlation()` in rENA/ena.cpp. |
|
| 266 |
*/ |
|
| 267 | 3x |
inline arma::mat ena_correlation(arma::mat points, arma::mat centroids, |
| 268 |
double conf_level = 0.95) {
|
|
| 269 | 3x |
int n = points.n_rows; |
| 270 | 3x |
int n_pairs = (n * (n - 1)) / 2; |
| 271 | ||
| 272 | 3x |
arma::umat idx1(1, n_pairs), idx2(1, n_pairs); |
| 273 | 3x |
int col = 0; |
| 274 | 43x |
for (int i = 0; i < n; i++) |
| 275 | 600x |
for (int j = i + 1; j < n; j++) { idx1[col] = i; idx2[col] = j; col++; }
|
| 276 | ||
| 277 | 6x |
arma::mat pts_diff = points.rows(idx1.row(0)) - points.rows(idx2.row(0)); |
| 278 | 6x |
arma::mat cts_diff = centroids.rows(idx1.row(0)) - centroids.rows(idx2.row(0)); |
| 279 | 3x |
arma::mat cr = arma::cor(pts_diff, cts_diff); |
| 280 | ||
| 281 | 3x |
double qq = normal_quantile((1.0 + conf_level) / 2.0); |
| 282 | 3x |
arma::mat out(points.n_cols, 3); |
| 283 | 9x |
for (arma::uword i = 0; i < points.n_cols; i++) {
|
| 284 | 6x |
double r = cr(i, i); |
| 285 | 6x |
double z = std::atanh(r); |
| 286 | 6x |
double sigma = 1.0 / std::sqrt(static_cast<double>(n_pairs) - 3.0); |
| 287 | 6x |
out(i, 0) = r; |
| 288 | 6x |
out(i, 1) = std::tanh(z - sigma * qq); |
| 289 | 12x |
out(i, 2) = std::tanh(z + sigma * qq); |
| 290 |
} |
|
| 291 | 6x |
return out; |
| 292 |
} |
|
| 293 | ||
| 294 |
/// @} |
|
| 295 | ||
| 296 |
// --------------------------------------------------------------------------- |
|
| 297 |
/// @name Group confidence intervals |
|
| 298 |
/// @{
|
|
| 299 |
// --------------------------------------------------------------------------- |
|
| 300 | ||
| 301 |
/** |
|
| 302 |
* @brief Student-t confidence interval for the mean of a group of ENA unit points. |
|
| 303 |
* |
|
| 304 |
* For each dimension d: |
|
| 305 |
* @code |
|
| 306 |
* mean ± t_{α/2, n-1} × (sample_sd / sqrt(n))
|
|
| 307 |
* @endcode |
|
| 308 |
* where α = 1 − @p conf_level and degrees-of-freedom = n − 1. |
|
| 309 |
* |
|
| 310 |
* @param[in] points ENA unit points for a single group, n_units × n_dims. |
|
| 311 |
* @param[in] conf_level Confidence level for the interval (default 0.95). |
|
| 312 |
* @returns An n_dims × 3 matrix whose columns are [mean, ci_lower, ci_upper]. |
|
| 313 |
* When n == 0 all entries are NaN. |
|
| 314 |
* When n == 1 the CI bounds are ±Inf. |
|
| 315 |
*/ |
|
| 316 | 14x |
inline arma::mat mean_ci(const arma::mat& points, double conf_level = 0.95) {
|
| 317 | 14x |
const int n = static_cast<int>(points.n_rows); |
| 318 | 14x |
const int n_dims = static_cast<int>(points.n_cols); |
| 319 | ||
| 320 | 14x |
arma::mat out(n_dims, 3); |
| 321 | 14x |
out.fill(std::numeric_limits<double>::quiet_NaN()); |
| 322 | ||
| 323 |
if (n == 0) return out; |
|
| 324 | ||
| 325 | 14x |
const double df = static_cast<double>(n - 1); |
| 326 | 14x |
const double t_crit = detail::t_quantile(1.0 - (1.0 - conf_level) / 2.0, df); |
| 327 | 14x |
const double INF = std::numeric_limits<double>::infinity(); |
| 328 | ||
| 329 | 40x |
for (int d = 0; d < n_dims; ++d) {
|
| 330 | 52x |
const arma::vec col = points.col(d); |
| 331 | 26x |
const double mu = arma::mean(col); |
| 332 | 26x |
out(d, 0) = mu; |
| 333 | 26x |
if (n == 1) {
|
| 334 |
// t_crit = Inf and se = 0 would give NaN; explicitly set ±Inf |
|
| 335 | 2x |
out(d, 1) = -INF; |
| 336 | 4x |
out(d, 2) = INF; |
| 337 |
} else {
|
|
| 338 | 24x |
const double se = arma::stddev(col) / std::sqrt(static_cast<double>(n)); |
| 339 | 24x |
out(d, 1) = mu - t_crit * se; |
| 340 | 48x |
out(d, 2) = mu + t_crit * se; |
| 341 |
} |
|
| 342 |
} |
|
| 343 | 14x |
return out; |
| 344 |
} |
|
| 345 | ||
| 346 |
/** |
|
| 347 |
* @brief Tukey-fence outlier interval for a group of ENA unit points. |
|
| 348 |
* |
|
| 349 |
* For each dimension d the half-width is computed as: |
|
| 350 |
* @code |
|
| 351 |
* half_width[d] = iqr_factor * IQR(points[:, d]) |
|
| 352 |
* @endcode |
|
| 353 |
* and the interval [−half_width, +half_width] is symmetric around zero, |
|
| 354 |
* matching rENA's formula: |
|
| 355 |
* @code |
|
| 356 |
* outlier.interval.values = matrix(...) * c(-1, 1) |
|
| 357 |
* @endcode |
|
| 358 |
* where the matrix is built from `c(IQR(dim1), IQR(dim2), ...) * iqr_factor`. |
|
| 359 |
* |
|
| 360 |
* IQR uses R's default type-7 / Hyndman-Fan #7 quantile, identical to |
|
| 361 |
* `numpy.percentile(method="linear")`. |
|
| 362 |
* |
|
| 363 |
* @param[in] points ENA unit points for a single group, n_units × n_dims. |
|
| 364 |
* @param[in] iqr_factor Multiplier applied to the IQR (default 1.5, the |
|
| 365 |
* standard Tukey fence threshold). |
|
| 366 |
* @returns An n_dims × 2 matrix whose columns are [lower, upper]. |
|
| 367 |
* When n == 0 all entries are NaN. |
|
| 368 |
* When n == 1, IQR == 0, so lower == upper == 0. |
|
| 369 |
*/ |
|
| 370 | 6x |
inline arma::mat outlier_ci(const arma::mat& points, double iqr_factor = 1.5) {
|
| 371 | 6x |
const int n_dims = static_cast<int>(points.n_cols); |
| 372 | 6x |
arma::mat out(n_dims, 2, arma::fill::zeros); |
| 373 | ||
| 374 | 6x |
if (points.n_rows == 0) {
|
| 375 | 1x |
out.fill(std::numeric_limits<double>::quiet_NaN()); |
| 376 | 1x |
return out; |
| 377 |
} |
|
| 378 | ||
| 379 | 16x |
for (int d = 0; d < n_dims; ++d) {
|
| 380 | 22x |
const double half_width = iqr_factor * detail::iqr(points.col(d)); |
| 381 | 11x |
out(d, 0) = -half_width; |
| 382 | 22x |
out(d, 1) = half_width; |
| 383 |
} |
|
| 384 | 5x |
return out; |
| 385 |
} |
|
| 386 | ||
| 387 |
/// @} |
|
| 388 | ||
| 389 |
// --------------------------------------------------------------------------- |
|
| 390 |
/// @name Node-position solvers |
|
| 391 |
/// @{
|
|
| 392 |
// --------------------------------------------------------------------------- |
|
| 393 | ||
| 394 |
/** |
|
| 395 |
* @brief Multiobjective least-squares node positions for undirected ENA. |
|
| 396 |
* |
|
| 397 |
* Half of each adjacency-vector entry (line weight) is distributed to each of |
|
| 398 |
* its two endpoint nodes, building a per-unit weight matrix. Each row is then |
|
| 399 |
* L1-normalised. An overdetermined system is solved per dimension: |
|
| 400 |
* @code |
|
| 401 |
* (W^T W) X = W^T T |
|
| 402 |
* @endcode |
|
| 403 |
* where W is the normalised weight matrix and T contains the rotated unit |
|
| 404 |
* points. |
|
| 405 |
* |
|
| 406 |
* @param[in] adj_mats Upper-triangular adjacency vectors stacked row-wise, |
|
| 407 |
* n_units × tri_size. |
|
| 408 |
* @param[in] t Rotated unit points, n_units × n_dims. |
|
| 409 |
* @param[in] num_dims Number of dimensions to solve for. |
|
| 410 |
* @returns A @ref NodePositions struct containing `nodes`, `centroids`, |
|
| 411 |
* `weights`, and `points`. |
|
| 412 |
* |
|
| 413 |
* @note Equivalent to `lws_lsq_positions()` in rENA/ena.cpp. |
|
| 414 |
*/ |
|
| 415 | 2x |
inline NodePositions node_positions(arma::mat adj_mats, arma::mat t, |
| 416 |
int num_dims) {
|
|
| 417 | 2x |
int tri_size = adj_mats.n_cols; |
| 418 | 2x |
int num_nodes = static_cast<int>( |
| 419 | 2x |
std::pow(std::ceil(std::sqrt(static_cast<double>(2 * tri_size))), 2.0) |
| 420 | 2x |
) - (2 * tri_size); |
| 421 | 2x |
int row_count = adj_mats.n_rows; |
| 422 | ||
| 423 | 2x |
arma::mat weights(row_count, num_nodes, arma::fill::zeros); |
| 424 | 13x |
for (int k = 0; k < row_count; k++) {
|
| 425 | 22x |
arma::rowvec curr = adj_mats.row(k); |
| 426 | 11x |
int z = 0; |
| 427 | 38x |
for (int x = 0; x < num_nodes - 1; x++) {
|
| 428 | 75x |
for (int y = 0; y <= x; y++) {
|
| 429 | 96x |
weights(k, x + 1) += 0.5 * curr[z]; |
| 430 | 96x |
weights(k, y) += 0.5 * curr[z]; |
| 431 | 48x |
z++; |
| 432 |
} |
|
| 433 |
} |
|
| 434 |
} |
|
| 435 | 13x |
for (int k = 0; k < row_count; k++) {
|
| 436 | 22x |
double len = arma::accu(arma::abs(weights.row(k))); |
| 437 |
if (len < 0.0001) len = 0.0001; |
|
| 438 | 22x |
weights.row(k) /= len; |
| 439 |
} |
|
| 440 | ||
| 441 | 2x |
arma::mat ssX(num_dims, num_nodes, arma::fill::zeros); |
| 442 | 2x |
arma::mat ssA = weights.t() * weights; |
| 443 | 6x |
for (int i = 0; i < num_dims; i++) {
|
| 444 | 8x |
arma::mat ssb = weights.t() * t.col(i); |
| 445 | 12x |
ssX.row(i) = qe::linalg::solve_spd(ssA, ssb).t(); |
| 446 |
} |
|
| 447 | ||
| 448 | 2x |
NodePositions r; |
| 449 | 2x |
r.nodes = ssX.t(); |
| 450 | 2x |
r.centroids = (ssX * weights.t()).t(); |
| 451 | 2x |
r.weights = weights; |
| 452 | 2x |
r.points = t; |
| 453 | 4x |
return r; |
| 454 |
} |
|
| 455 | ||
| 456 |
/** |
|
| 457 |
* @brief Least-squares node positions for directed (ordered) ENA. |
|
| 458 |
* |
|
| 459 |
* Each row of @p line_weights is an n_nodes × n_nodes directed weight matrix |
|
| 460 |
* stored in row-major order. The per-unit node weight for node x accumulates |
|
| 461 |
* the full row weight plus all column weights directed at x from other nodes. |
|
| 462 |
* Each row is L1-normalised before solving. |
|
| 463 |
* |
|
| 464 |
* When @p combine_pairs is `false` (the standard directed case) the system: |
|
| 465 |
* @code |
|
| 466 |
* (W^T W) X = W^T P |
|
| 467 |
* @endcode |
|
| 468 |
* is solved directly with W = normalised weight matrix and P = @p points. |
|
| 469 |
* |
|
| 470 |
* When @p combine_pairs is `true`, adjacent row pairs (ground row k and |
|
| 471 |
* response row k+1) are summed before solving: |
|
| 472 |
* @code |
|
| 473 |
* W_combined[k/2] = W[k] + W[k+1] |
|
| 474 |
* P_combined[k/2] = P[k] + P[k+1] |
|
| 475 |
* @endcode |
|
| 476 |
* The system is then solved on the combined matrices, but centroids are |
|
| 477 |
* projected back using the original (un-combined) weight matrix so that |
|
| 478 |
* every unit retains its own centroid. |
|
| 479 |
* |
|
| 480 |
* @param[in] line_weights Directed adjacency vectors, n_units × (n_nodes²). |
|
| 481 |
* @param[in] points Rotated unit points, n_units × n_dims. |
|
| 482 |
* @param[in] num_dims Number of dimensions to solve for. |
|
| 483 |
* @param[in] combine_pairs If `true`, sum adjacent row pairs before solving |
|
| 484 |
* (ground + response model). Default `false`. |
|
| 485 |
* @returns A @ref NodePositions struct containing `nodes`, `centroids`, |
|
| 486 |
* `weights`, and `points`. |
|
| 487 |
* |
|
| 488 |
* @note Equivalent to `directed_node_positions()` in rENA/ena.cpp when |
|
| 489 |
* @p combine_pairs is `false`. |
|
| 490 |
* @note Equivalent to `directed_node_positions_with_ground_response_added()` |
|
| 491 |
* in rENA/ena.cpp when @p combine_pairs is `true`. |
|
| 492 |
* |
|
| 493 |
* @warning When @p combine_pairs is `true`, @p line_weights must have an even |
|
| 494 |
* number of rows (row_count must be even); odd row counts result in |
|
| 495 |
* an out-of-bounds access when forming the combined matrices. |
|
| 496 |
*/ |
|
| 497 | 2x |
inline NodePositions directed_node_positions(arma::mat line_weights, |
| 498 |
arma::mat points, int num_dims, |
|
| 499 |
bool combine_pairs = false) {
|
|
| 500 |
int num_nodes = static_cast<int>( |
|
| 501 | 2x |
std::ceil(std::sqrt(static_cast<double>(line_weights.n_cols))) |
| 502 |
); |
|
| 503 | 2x |
int row_count = line_weights.n_rows; |
| 504 | ||
| 505 | 2x |
arma::mat nw(row_count, num_nodes, arma::fill::zeros); |
| 506 | 12x |
for (int k = 0; k < row_count; k++) {
|
| 507 | 20x |
arma::mat curr = line_weights.row(k); |
| 508 | 10x |
int z = 0; |
| 509 | 40x |
for (int x = 0; x < num_nodes; x++) |
| 510 | 120x |
for (int y = 0; y < num_nodes; y++) {
|
| 511 | 180x |
nw(k, x) += curr[z]; |
| 512 | 210x |
if (x != y) nw(k, y) += curr[z]; |
| 513 | 90x |
z++; |
| 514 |
} |
|
| 515 |
} |
|
| 516 | 12x |
for (int k = 0; k < row_count; k++) {
|
| 517 | 20x |
double len = arma::accu(arma::abs(nw.row(k))); |
| 518 |
if (len < 0.0001) len = 0.0001; |
|
| 519 | 20x |
nw.row(k) /= len; |
| 520 |
} |
|
| 521 | ||
| 522 | 2x |
if (combine_pairs) {
|
| 523 |
// Combine paired rows (ground at k, response at k+1) |
|
| 524 | 1x |
arma::mat nw_added(row_count / 2, num_nodes, arma::fill::zeros); |
| 525 | 1x |
arma::mat pts_added(row_count / 2, num_dims, arma::fill::zeros); |
| 526 | 4x |
for (int k = 0; k < row_count; k += 2) {
|
| 527 | 12x |
nw_added.row(k / 2) = nw.row(k) + nw.row(k + 1); |
| 528 | 12x |
pts_added.row(k / 2) = points.row(k) + points.row(k + 1); |
| 529 |
} |
|
| 530 | ||
| 531 | 1x |
arma::mat ssX(num_dims, num_nodes, arma::fill::zeros); |
| 532 | 1x |
arma::mat ssA = nw_added.t() * nw_added; |
| 533 | 3x |
for (int i = 0; i < num_dims; i++) {
|
| 534 | 4x |
arma::mat ssb = nw_added.t() * pts_added.col(i); |
| 535 | 6x |
ssX.row(i) = qe::linalg::solve_spd(ssA, ssb).t(); |
| 536 |
} |
|
| 537 | ||
| 538 | 1x |
NodePositions r; |
| 539 | 1x |
r.nodes = ssX.t(); |
| 540 | 1x |
r.centroids = (ssX * nw.t()).t(); |
| 541 | 1x |
r.weights = nw; |
| 542 | 1x |
r.points = points; |
| 543 | 1x |
return r; |
| 544 |
} |
|
| 545 | ||
| 546 | 1x |
arma::mat ssX(num_dims, num_nodes, arma::fill::zeros); |
| 547 | 1x |
arma::mat ssA = nw.t() * nw; |
| 548 | 3x |
for (int i = 0; i < num_dims; i++) {
|
| 549 | 4x |
arma::mat ssb = nw.t() * points.col(i); |
| 550 | 6x |
ssX.row(i) = qe::linalg::solve_spd(ssA, ssb).t(); |
| 551 |
} |
|
| 552 | ||
| 553 | 1x |
NodePositions r; |
| 554 | 1x |
r.nodes = ssX.t(); |
| 555 | 1x |
r.centroids = (ssX * nw.t()).t(); |
| 556 | 1x |
r.weights = nw; |
| 557 | 1x |
r.points = points; |
| 558 | 1x |
return r; |
| 559 |
} |
|
| 560 | ||
| 561 |
/// @} |
|
| 562 | ||
| 563 |
} // namespace qe |
|
| 564 | ||
| 565 |
#endif // LIBQE_MODELING_HPP |
| 1 |
/** |
|
| 2 |
* @file normalization.hpp |
|
| 3 |
* @brief Row-wise normalization utilities for ENA connection networks. |
|
| 4 |
*/ |
|
| 5 |
#ifndef LIBQE_NORMALIZATION_HPP |
|
| 6 |
#define LIBQE_NORMALIZATION_HPP |
|
| 7 | ||
| 8 |
#include <armadillo> |
|
| 9 |
#include <cmath> |
|
| 10 | ||
| 11 |
namespace qe {
|
|
| 12 | ||
| 13 |
/** |
|
| 14 |
* @brief Normalize each row to unit L2 norm (project onto the unit hypersphere). |
|
| 15 |
* |
|
| 16 |
* Rows whose L2 norm is zero are left unchanged. |
|
| 17 |
* |
|
| 18 |
* @param m Input matrix (n_units × n_connections), row-major. |
|
| 19 |
* @returns Matrix of the same shape with each row divided by its L2 norm. |
|
| 20 |
* @note Equivalent to @c fun_sphere_norm() in rENA/ena.cpp. |
|
| 21 |
*/ |
|
| 22 | 3x |
inline arma::mat normalize_networks(arma::mat m) {
|
| 23 | 3x |
arma::mat out(m.n_rows, m.n_cols, arma::fill::zeros); |
| 24 | 11x |
for (arma::uword r = 0; r < m.n_rows; r++) {
|
| 25 | 8x |
double len = arma::norm(m.row(r), 2); |
| 26 | 15x |
if (len > 0.0) out.row(r) = m.row(r) / len; |
| 27 |
} |
|
| 28 | 3x |
return out; |
| 29 |
} |
|
| 30 | ||
| 31 |
/** |
|
| 32 |
* @brief Scale every entry by the reciprocal of the largest row L2 norm. |
|
| 33 |
* |
|
| 34 |
* Preserves relative magnitudes across rows; does not normalise each row |
|
| 35 |
* individually. If the largest row norm is zero, the matrix is returned |
|
| 36 |
* unchanged. |
|
| 37 |
* |
|
| 38 |
* @param m Input matrix (n_units × n_connections), modified in place. |
|
| 39 |
* @returns Scaled matrix (same shape as @p m). |
|
| 40 |
* @note Equivalent to @c fun_skip_sphere_norm() in rENA/ena.cpp. |
|
| 41 |
*/ |
|
| 42 | 2x |
inline arma::mat scale_networks(arma::mat m) {
|
| 43 | 2x |
double largest = 0.0; |
| 44 | 7x |
for (arma::uword r = 0; r < m.n_rows; r++) {
|
| 45 | 5x |
largest = std::max(largest, arma::norm(m.row(r), 2)); |
| 46 |
} |
|
| 47 | 2x |
if (largest > 0.0) m /= largest; |
| 48 | 4x |
return m; |
| 49 |
} |
|
| 50 | ||
| 51 |
} // namespace qe |
|
| 52 | ||
| 53 |
#endif // LIBQE_NORMALIZATION_HPP |
| 1 |
/** |
|
| 2 |
* @file rotation.hpp |
|
| 3 |
* @brief Rotation routines for Epistemic Network Analysis (ENA): |
|
| 4 |
* SVD, deflation, orthogonal-SVD, complete (generalized), and |
|
| 5 |
* means rotation. All routines are designed to match the |
|
| 6 |
* numerical output of the reference R implementation in rENA. |
|
| 7 |
*/ |
|
| 8 | ||
| 9 |
#ifndef LIBQE_ROTATION_HPP |
|
| 10 |
#define LIBQE_ROTATION_HPP |
|
| 11 | ||
| 12 |
#include "linalg_fallback.hpp" |
|
| 13 |
#include <armadillo> |
|
| 14 |
#include <stdexcept> |
|
| 15 |
#include <string> |
|
| 16 |
#include <utility> |
|
| 17 |
#include <vector> |
|
| 18 | ||
| 19 |
namespace qe {
|
|
| 20 | ||
| 21 |
/// @name Return types |
|
| 22 |
/// @{
|
|
| 23 | ||
| 24 |
/** |
|
| 25 |
* @brief Aggregated result returned by every rotation routine. |
|
| 26 |
* |
|
| 27 |
* The layout mirrors the output of R's `prcomp()`: `rotation` holds the |
|
| 28 |
* right-singular vectors (principal axes), `eigenvalues` holds the |
|
| 29 |
* explained-variance values (sdev^2 in R parlance), and `column_names` |
|
| 30 |
* carries the human-readable axis labels used downstream for network plots |
|
| 31 |
* and loadings tables. |
|
| 32 |
*/ |
|
| 33 |
struct RotationResult {
|
|
| 34 |
arma::mat rotation; ///< p x p orthogonal matrix; column j is rotation axis j. |
|
| 35 |
arma::vec eigenvalues; ///< Length-p vector of sdev^2 values, matching rENA's convention. |
|
| 36 |
std::vector<std::string> column_names; ///< Axis labels, e.g. "MR1", "SVD2", "GMR1". |
|
| 37 |
}; |
|
| 38 | ||
| 39 |
/** |
|
| 40 |
* @brief A pair of 0-based row-index vectors identifying two groups within a |
|
| 41 |
* points matrix. |
|
| 42 |
* |
|
| 43 |
* Used by means_rotation() to specify which rows belong to group A and which |
|
| 44 |
* belong to group B for each successive mean-difference axis. |
|
| 45 |
*/ |
|
| 46 |
struct GroupPair {
|
|
| 47 |
arma::uvec a; ///< Row indices of group A (0-based). |
|
| 48 |
arma::uvec b; ///< Row indices of group B (0-based). |
|
| 49 |
}; |
|
| 50 | ||
| 51 |
/// @} |
|
| 52 | ||
| 53 |
/// @name SVD rotation |
|
| 54 |
/// @{
|
|
| 55 | ||
| 56 |
/** |
|
| 57 |
* @brief Compute a full SVD rotation of a (pre-centered) points matrix. |
|
| 58 |
* |
|
| 59 |
* Produces the right-singular vectors V as the rotation matrix and scales |
|
| 60 |
* each singular value to an eigenvalue (sdev^2) compatible with rENA's |
|
| 61 |
* `prcomp()` output. |
|
| 62 |
* |
|
| 63 |
* @param points n x p data matrix. The caller is responsible for |
|
| 64 |
* centering upstream; this function applies no centering. |
|
| 65 |
* |
|
| 66 |
* @returns A RotationResult where: |
|
| 67 |
* - `rotation` = V (p x p full SVD; trailing columns span the null |
|
| 68 |
* space when the matrix is rank-deficient). |
|
| 69 |
* - `eigenvalues[j]` = s[j]^2 / max(1, n - 1) (== sdev^2 in R). |
|
| 70 |
* - `column_names` = {"SVD1", "SVD2", ..., "SVDp"}.
|
|
| 71 |
* |
|
| 72 |
* @note Equivalent to `prcomp(points, retx=FALSE, scale=FALSE, |
|
| 73 |
* center=FALSE, tol=0)` in rENA/R. |
|
| 74 |
* |
|
| 75 |
* @note Sign convention: none. Signs follow the underlying LAPACK SVD, |
|
| 76 |
* matching rENA's long-standing behavior. A deterministic sign rule |
|
| 77 |
* (e.g. svd_flip) may be added later as an opt-in flag. |
|
| 78 |
*/ |
|
| 79 | 39x |
inline RotationResult ena_svd(const arma::mat& points) {
|
| 80 | 39x |
arma::mat U, V; |
| 81 | 39x |
arma::vec s; |
| 82 | 39x |
qe::linalg::svd(U, s, V, points); |
| 83 | ||
| 84 | 39x |
const arma::uword p = points.n_cols; |
| 85 | 78x |
const double denom = points.n_rows > 1 |
| 86 |
? static_cast<double>(points.n_rows - 1) |
|
| 87 |
: 1.0; |
|
| 88 | ||
| 89 | 39x |
arma::vec eigenvalues(p, arma::fill::zeros); |
| 90 | 39x |
const arma::uword k = std::min<arma::uword>(s.n_elem, p); |
| 91 | 195x |
for (arma::uword j = 0; j < k; ++j) {
|
| 92 | 468x |
eigenvalues(j) = (s(j) * s(j)) / denom; |
| 93 |
} |
|
| 94 | ||
| 95 | 39x |
std::vector<std::string> labels(p); |
| 96 | 195x |
for (arma::uword j = 0; j < p; ++j) {
|
| 97 | 156x |
labels[j] = "SVD" + std::to_string(j + 1); |
| 98 |
} |
|
| 99 | 39x |
return {V, eigenvalues, std::move(labels)};
|
| 100 |
} |
|
| 101 | ||
| 102 |
/// @} |
|
| 103 | ||
| 104 |
/// @name Deflation |
|
| 105 |
/// @{
|
|
| 106 | ||
| 107 |
/** |
|
| 108 |
* @brief Project a data matrix onto the hyperplane orthogonal to a given axis. |
|
| 109 |
* |
|
| 110 |
* Computes `data - (data * axis) * axis^T`, effectively removing the |
|
| 111 |
* component of every row of `data` that lies along `axis`. |
|
| 112 |
* |
|
| 113 |
* @param data n x p data matrix. |
|
| 114 |
* @param axis Unit-norm column vector of length p. The caller is |
|
| 115 |
* responsible for normalizing `axis` before calling this |
|
| 116 |
* function; no normalization is performed internally. |
|
| 117 |
* |
|
| 118 |
* @returns n x p matrix with the projection onto `axis` subtracted out. |
|
| 119 |
*/ |
|
| 120 | 16x |
inline arma::mat deflate(const arma::mat& data, const arma::vec& axis) {
|
| 121 | 32x |
return data - (data * axis) * axis.t(); |
| 122 |
} |
|
| 123 | ||
| 124 |
/// @} |
|
| 125 | ||
| 126 |
/// @name Orthogonal SVD rotation |
|
| 127 |
/// @{
|
|
| 128 | ||
| 129 |
/** |
|
| 130 |
* @brief Orthonormalize named axes via QR, then complete the rotation with |
|
| 131 |
* an SVD of the data projected onto the orthogonal complement. |
|
| 132 |
* |
|
| 133 |
* The named axes in `weights` are orthonormalized via a full QR |
|
| 134 |
* decomposition. The first k columns of the resulting Q become the |
|
| 135 |
* "fixed" part of the rotation; the remaining p - k columns define a |
|
| 136 |
* complementary subspace onto which `data` is projected, and a further SVD |
|
| 137 |
* fills those trailing rotation columns. |
|
| 138 |
* |
|
| 139 |
* @param data n x p data matrix (caller-centered if required). |
|
| 140 |
* @param weights p x k matrix of named axis directions. Column norms |
|
| 141 |
* need not be 1; QR handles normalization internally. |
|
| 142 |
* @param named_labels Length-k vector of labels for the first k rotation |
|
| 143 |
* axes. Trailing axes receive labels "SVD{k+1}" ..
|
|
| 144 |
* "SVDp". |
|
| 145 |
* |
|
| 146 |
* @returns A RotationResult containing the combined p x p rotation, |
|
| 147 |
* eigenvalues for the SVD-filled trailing axes (indices k..p-1; |
|
| 148 |
* the first k eigenvalues are zero), and axis labels. |
|
| 149 |
* |
|
| 150 |
* @note Equivalent to `orthogonal_svd()` in rENA/R/ena.rotate.by.mean.R: |
|
| 151 |
* @code |
|
| 152 |
* Q = qr.Q(qr(weights), complete = TRUE) # p x p |
|
| 153 |
* X_bar = data %*% Q[, (k+1):p] # n x (p - k) |
|
| 154 |
* V = prcomp(X_bar)$rotation |
|
| 155 |
* out = cbind(Q[, 1:k], Q[, (k+1):p] %*% V) |
|
| 156 |
* @endcode |
|
| 157 |
* |
|
| 158 |
* @note The named axes in the OUTPUT are the orthonormalized Q columns, NOT |
|
| 159 |
* the original `weights` columns. Use complete_rotation() if you need |
|
| 160 |
* to preserve input axes verbatim. |
|
| 161 |
* |
|
| 162 |
* @throws std::runtime_error if `data.n_cols != weights.n_rows` or |
|
| 163 |
* `named_labels.size() != weights.n_cols`. |
|
| 164 |
*/ |
|
| 165 | 10x |
inline RotationResult orthogonal_svd( |
| 166 |
const arma::mat& data, |
|
| 167 |
const arma::mat& weights, |
|
| 168 |
const std::vector<std::string>& named_labels) {
|
|
| 169 | 10x |
const arma::uword p = weights.n_rows; |
| 170 | 10x |
const arma::uword k = weights.n_cols; |
| 171 | 10x |
if (data.n_cols != p) {
|
| 172 |
throw std::runtime_error( |
|
| 173 | 1x |
"orthogonal_svd: data.n_cols must equal weights.n_rows"); |
| 174 |
} |
|
| 175 | 9x |
if (named_labels.size() != k) {
|
| 176 |
throw std::runtime_error( |
|
| 177 | 1x |
"orthogonal_svd: named_labels.size() must equal weights.n_cols"); |
| 178 |
} |
|
| 179 | 8x |
if (k == 0) {
|
| 180 | ! |
return ena_svd(data); |
| 181 |
} |
|
| 182 | ||
| 183 | 8x |
arma::mat Q, R; |
| 184 | 8x |
qe::linalg::qr_full(Q, R, weights); // Q is p x p |
| 185 | ||
| 186 | 8x |
arma::mat rotation(p, p); |
| 187 | 8x |
arma::vec eigenvalues(p, arma::fill::zeros); |
| 188 | 24x |
rotation.cols(0, k - 1) = Q.cols(0, k - 1); |
| 189 | ||
| 190 | 8x |
if (k < p) {
|
| 191 | 16x |
arma::mat Q_comp = Q.cols(k, p - 1); // p x (p - k) |
| 192 | 8x |
arma::mat X_bar = data * Q_comp; // n x (p - k) |
| 193 | 8x |
RotationResult inner = ena_svd(X_bar); |
| 194 | 16x |
rotation.cols(k, p - 1) = Q_comp * inner.rotation; |
| 195 | 32x |
for (arma::uword j = 0; j < (p - k); ++j) {
|
| 196 | 48x |
eigenvalues(k + j) = inner.eigenvalues(j); |
| 197 |
} |
|
| 198 |
} |
|
| 199 | ||
| 200 | 8x |
std::vector<std::string> labels(p); |
| 201 | 22x |
for (arma::uword j = 0; j < k; ++j) labels[j] = named_labels[j]; |
| 202 | 32x |
for (arma::uword j = k; j < p; ++j) {
|
| 203 | 24x |
labels[j] = "SVD" + std::to_string(j + 1); |
| 204 |
} |
|
| 205 | 16x |
return {rotation, eigenvalues, std::move(labels)};
|
| 206 |
} |
|
| 207 | ||
| 208 |
/// @} |
|
| 209 | ||
| 210 |
/// @name Complete (generalized) rotation |
|
| 211 |
/// @{
|
|
| 212 | ||
| 213 |
/** |
|
| 214 |
* @brief Keep named axes verbatim and fill remaining axes from an SVD of the |
|
| 215 |
* data after parallel deflation by all named axes. |
|
| 216 |
* |
|
| 217 |
* Unlike orthogonal_svd(), this routine preserves the caller-supplied |
|
| 218 |
* `named_axes` exactly (no QR re-orthonormalization). The complementary |
|
| 219 |
* axes are obtained by deflating `data` simultaneously by all named axes and |
|
| 220 |
* then running ena_svd() on the result. |
|
| 221 |
* |
|
| 222 |
* The deflation is **parallel**: every projection is subtracted from the |
|
| 223 |
* original `data`, not from a progressively deflated copy: |
|
| 224 |
* @code |
|
| 225 |
* defA = data - data * (named_axes * named_axes^T) |
|
| 226 |
* @endcode |
|
| 227 |
* This matches `defA <- A - A %*% v1 %*% t(v1) - A %*% v2 %*% t(v2)` in |
|
| 228 |
* rENA line-for-line. For mutually orthogonal axes the result is identical |
|
| 229 |
* to sequential deflation; for non-orthogonal axes it differs. |
|
| 230 |
* |
|
| 231 |
* @param data n x p data matrix. |
|
| 232 |
* @param named_axes p x k matrix of named rotation axes. Each column |
|
| 233 |
* must be unit-norm; orthonormality between columns is |
|
| 234 |
* NOT required. |
|
| 235 |
* @param named_labels Length-k vector of labels for the first k axes. |
|
| 236 |
* Trailing axes receive labels "SVD{k+1}" .. "SVDp".
|
|
| 237 |
* Conventional labels for generalized rotation are |
|
| 238 |
* "GMR1", "GMR2", then "SVD{k+1}" onward, but labels
|
|
| 239 |
* are entirely caller-supplied. |
|
| 240 |
* |
|
| 241 |
* @returns A RotationResult where columns 0..k-1 of `rotation` equal |
|
| 242 |
* `named_axes` verbatim and columns k..p-1 come from the SVD of |
|
| 243 |
* the deflated data. |
|
| 244 |
* |
|
| 245 |
* @note Equivalent to the tail of `ena.rotate.by.generalized` in rENA |
|
| 246 |
* (canonical version: commit 2c079126 on rENA `origin/main`). |
|
| 247 |
* |
|
| 248 |
* @throws std::runtime_error if `data.n_cols != named_axes.n_rows` or |
|
| 249 |
* `named_labels.size() != named_axes.n_cols`. |
|
| 250 |
*/ |
|
| 251 | 16x |
inline RotationResult complete_rotation( |
| 252 |
const arma::mat& data, |
|
| 253 |
const arma::mat& named_axes, |
|
| 254 |
const std::vector<std::string>& named_labels) {
|
|
| 255 | 16x |
const arma::uword p = named_axes.n_rows; |
| 256 | 16x |
const arma::uword k = named_axes.n_cols; |
| 257 | 16x |
if (data.n_cols != p) {
|
| 258 |
throw std::runtime_error( |
|
| 259 | ! |
"complete_rotation: data.n_cols must equal named_axes.n_rows"); |
| 260 |
} |
|
| 261 | 16x |
if (named_labels.size() != k) {
|
| 262 |
throw std::runtime_error( |
|
| 263 | ! |
"complete_rotation: named_labels.size() must equal named_axes.n_cols"); |
| 264 |
} |
|
| 265 | 16x |
if (k == 0) {
|
| 266 | ! |
return ena_svd(data); |
| 267 |
} |
|
| 268 | ||
| 269 | 32x |
arma::mat defA = data - data * named_axes * named_axes.t(); |
| 270 | 16x |
RotationResult inner = ena_svd(defA); |
| 271 | ||
| 272 | 16x |
arma::mat rotation(p, p); |
| 273 | 16x |
arma::vec eigenvalues(p, arma::fill::zeros); |
| 274 | 32x |
rotation.cols(0, k - 1) = named_axes; |
| 275 | 16x |
if (k < p) {
|
| 276 | 48x |
rotation.cols(k, p - 1) = inner.rotation.cols(0, p - k - 1); |
| 277 | 53x |
for (arma::uword j = 0; j < (p - k); ++j) {
|
| 278 | 74x |
eigenvalues(k + j) = inner.eigenvalues(j); |
| 279 |
} |
|
| 280 |
} |
|
| 281 | ||
| 282 | 16x |
std::vector<std::string> labels(p); |
| 283 | 47x |
for (arma::uword j = 0; j < k; ++j) labels[j] = named_labels[j]; |
| 284 | 53x |
for (arma::uword j = k; j < p; ++j) {
|
| 285 | 37x |
labels[j] = "SVD" + std::to_string(j + 1); |
| 286 |
} |
|
| 287 | 32x |
return {rotation, eigenvalues, std::move(labels)};
|
| 288 |
} |
|
| 289 | ||
| 290 |
/// @} |
|
| 291 | ||
| 292 |
/// @name Means rotation |
|
| 293 |
/// @{
|
|
| 294 | ||
| 295 |
/** |
|
| 296 |
* @brief Compute a means rotation from one or more group pairs. |
|
| 297 |
* |
|
| 298 |
* For each GroupPair, computes the normalized mean-difference vector on the |
|
| 299 |
* progressively deflated data and stacks the resulting unit-norm axes into a |
|
| 300 |
* weights matrix. The final rotation is completed by orthogonal_svd(). |
|
| 301 |
* The input `points` is mean-centered before any axes are computed, matching |
|
| 302 |
* rENA's `scale(data, scale=FALSE, center=TRUE)`. |
|
| 303 |
* |
|
| 304 |
* @param points n x p matrix of network positions (raw, not pre-centered). |
|
| 305 |
* @param pairs Ordered list of GroupPair objects. Each pair contributes |
|
| 306 |
* one "MR" axis in sequence. Must be non-empty. |
|
| 307 |
* |
|
| 308 |
* @returns A RotationResult with `column_names` = {"MR1", ..., "MRm",
|
|
| 309 |
* "SVD{m+1}", ..., "SVDp"} where m = pairs.size().
|
|
| 310 |
* |
|
| 311 |
* @note Equivalent to `ena.rotate.by.mean()` in rENA/R/ena.rotate.by.mean.R. |
|
| 312 |
* The progressive deflation (each axis is computed on the data after |
|
| 313 |
* removing all previous axes) is the key numerical step. |
|
| 314 |
* |
|
| 315 |
* @warning There is NO guard against a zero-norm mean-difference vector. |
|
| 316 |
* If the two group means are identical on the current deflated data, |
|
| 317 |
* the axis will contain NaN values (division by zero). This mirrors |
|
| 318 |
* a latent bug in rENA and is preserved verbatim for numerical |
|
| 319 |
* fidelity; a safe opt-in guard will be added in a future release. |
|
| 320 |
* |
|
| 321 |
* @throws std::runtime_error if `pairs` is empty. |
|
| 322 |
*/ |
|
| 323 | 8x |
inline RotationResult means_rotation(const arma::mat& points, |
| 324 |
const std::vector<GroupPair>& pairs) {
|
|
| 325 | 8x |
if (pairs.empty()) {
|
| 326 | 1x |
throw std::runtime_error("Unable to rotate without 2 groups.");
|
| 327 |
} |
|
| 328 | ||
| 329 | 14x |
arma::mat data = points.each_row() - arma::mean(points, 0); |
| 330 | 7x |
arma::mat deflated = data; |
| 331 | 7x |
arma::mat weights(data.n_cols, pairs.size(), arma::fill::zeros); |
| 332 | 7x |
std::vector<std::string> labels; |
| 333 | 7x |
labels.reserve(pairs.size()); |
| 334 | ||
| 335 | 19x |
for (std::size_t i = 0; i < pairs.size(); ++i) {
|
| 336 | 24x |
const arma::rowvec mean_a = arma::mean(deflated.rows(pairs[i].a), 0); |
| 337 | 24x |
const arma::rowvec mean_b = arma::mean(deflated.rows(pairs[i].b), 0); |
| 338 | 24x |
const arma::vec diff = (mean_a - mean_b).t(); |
| 339 | 12x |
const double norm = std::sqrt(arma::accu(diff % diff)); |
| 340 | 12x |
const arma::vec axis = diff / norm; // no zero-norm guard |
| 341 | ||
| 342 | 12x |
deflated = deflate(deflated, axis); |
| 343 | 24x |
weights.col(i) = axis; |
| 344 | 12x |
labels.push_back("MR" + std::to_string(i + 1));
|
| 345 |
} |
|
| 346 | ||
| 347 | 14x |
return orthogonal_svd(deflated, weights, labels); |
| 348 |
} |
|
| 349 | ||
| 350 |
/// @} |
|
| 351 | ||
| 352 |
} // namespace qe |
|
| 353 | ||
| 354 |
#endif // LIBQE_ROTATION_HPP |
| 1 |
// [[Rcpp::depends(RcppArmadillo)]] |
|
| 2 |
#include <RcppArmadillo.h> |
|
| 3 |
#include <libqe/libqe.hpp> |
|
| 4 | ||
| 5 |
using namespace Rcpp; |
|
| 6 | ||
| 7 |
// ============================================================================= |
|
| 8 |
// Adjacency utilities |
|
| 9 |
// ============================================================================= |
|
| 10 | ||
| 11 |
//' Number of upper-triangle pairs for n codes |
|
| 12 |
//' @param n Number of codes |
|
| 13 |
//' @return Integer: n*(n-1)/2 |
|
| 14 |
//' @export |
|
| 15 |
// [[Rcpp::export]] |
|
| 16 | ! |
int choose_two(int n) {
|
| 17 | ! |
return qe::choose_two(n); |
| 18 |
} |
|
| 19 | ||
| 20 |
//' Upper-triangle index pairs |
|
| 21 |
//' @param len Number of codes (side length of square matrix) |
|
| 22 |
//' @param row -1 = both rows, 0 = row indices only, 1 = col indices only |
|
| 23 |
//' @export |
|
| 24 |
// [[Rcpp::export]] |
|
| 25 | 9x |
arma::umat connection_indices(int len, int row = -1) {
|
| 26 | 9x |
return qe::connection_indices(len, row); |
| 27 |
} |
|
| 28 | ||
| 29 |
//' Pairwise products → upper-triangle vector |
|
| 30 |
//' @param v Numeric vector of code values |
|
| 31 |
//' @export |
|
| 32 |
// [[Rcpp::export]] |
|
| 33 | 2x |
arma::rowvec code_connections(arma::mat v) {
|
| 34 | 2x |
return qe::code_connections(v); |
| 35 |
} |
|
| 36 | ||
| 37 |
//' Fold a directed (n*n) vector into an undirected upper-triangle vector |
|
| 38 |
//' @param v Numeric vector of length n*n |
|
| 39 |
//' @export |
|
| 40 |
// [[Rcpp::export]] |
|
| 41 | 4x |
arma::rowvec fold_directed_network(arma::vec v) {
|
| 42 | 4x |
return qe::fold_directed_network(v); |
| 43 |
} |
|
| 44 | ||
| 45 |
//' Flatten an adjacency matrix to a connection vector |
|
| 46 |
//' @param x Numeric matrix |
|
| 47 |
//' @param full TRUE = full n*n (directed); FALSE = upper triangle (undirected) |
|
| 48 |
//' @export |
|
| 49 |
// [[Rcpp::export]] |
|
| 50 | 2x |
arma::rowvec network_to_vector(arma::mat x, bool full = true) {
|
| 51 | 2x |
return qe::network_to_vector(x, full); |
| 52 |
} |
|
| 53 | ||
| 54 |
//' Code-name pairs for upper-triangle positions ("A & B")
|
|
| 55 |
//' @param v Character vector of code names |
|
| 56 |
//' @export |
|
| 57 |
// [[Rcpp::export]] |
|
| 58 | 2x |
std::vector<std::string> connection_names(std::vector<std::string> v) {
|
| 59 | 2x |
return qe::connection_names(v); |
| 60 |
} |
|
| 61 | ||
| 62 |
// ============================================================================= |
|
| 63 |
// Normalization |
|
| 64 |
// ============================================================================= |
|
| 65 | ||
| 66 |
//' Row-wise L2 (sphere) normalization |
|
| 67 |
//' @param m Numeric matrix |
|
| 68 |
//' @export |
|
| 69 |
// [[Rcpp::export]] |
|
| 70 | 3x |
arma::mat normalize_networks(arma::mat m) {
|
| 71 | 3x |
return qe::normalize_networks(m); |
| 72 |
} |
|
| 73 | ||
| 74 |
//' Max-norm scaling (divide all rows by the largest row L2 norm) |
|
| 75 |
//' @param m Numeric matrix |
|
| 76 |
//' @export |
|
| 77 |
// [[Rcpp::export]] |
|
| 78 | 2x |
arma::mat scale_networks(arma::mat m) {
|
| 79 | 2x |
return qe::scale_networks(m); |
| 80 |
} |
|
| 81 | ||
| 82 |
// ============================================================================= |
|
| 83 |
// Modeling |
|
| 84 |
// ============================================================================= |
|
| 85 | ||
| 86 |
//' Center points (subtract column means) |
|
| 87 |
//' @param values Numeric matrix |
|
| 88 |
//' @export |
|
| 89 |
// [[Rcpp::export]] |
|
| 90 | 2x |
arma::mat center_points(arma::mat values) {
|
| 91 | 2x |
return qe::center_points(values); |
| 92 |
} |
|
| 93 | ||
| 94 |
//' Confidence interval for the mean of a group of ENA unit points |
|
| 95 |
//' |
|
| 96 |
//' For each dimension computes a t-based CI: |
|
| 97 |
//' \code{mean ± t(α/2, n-1) × (SD / sqrt(n))} where \code{α = 1 - conf_level}.
|
|
| 98 |
//' |
|
| 99 |
//' @param points Numeric matrix (units x dims) — one row per unit in the group |
|
| 100 |
//' @param conf_level Confidence level (default 0.95) |
|
| 101 |
//' @return Numeric matrix (n_dims x 3): columns are [mean, ci_lower, ci_upper]. |
|
| 102 |
//' When \code{nrow(points) == 1} the CI bounds are \code{±Inf}.
|
|
| 103 |
//' @export |
|
| 104 |
// [[Rcpp::export]] |
|
| 105 | 14x |
arma::mat mean_ci(arma::mat points, double conf_level = 0.95) {
|
| 106 | 14x |
return qe::mean_ci(points, conf_level); |
| 107 |
} |
|
| 108 | ||
| 109 |
//' Outlier interval based on IQR (Tukey fence) for a group of ENA unit points |
|
| 110 |
//' |
|
| 111 |
//' For each dimension: \code{lower = -iqr_factor * IQR}, \code{upper = +iqr_factor * IQR}.
|
|
| 112 |
//' Equivalent to rENA's: |
|
| 113 |
//' \preformatted{
|
|
| 114 |
//' oi <- c(IQR(pts[,1]), ...) * iqr_factor |
|
| 115 |
//' matrix(rep(oi, 2), ncol = n_dims, byrow = TRUE) * c(-1, 1) |
|
| 116 |
//' } |
|
| 117 |
//' IQR uses R's default \code{type = 7} quantile.
|
|
| 118 |
//' |
|
| 119 |
//' @param points Numeric matrix (units x dims) — one row per unit in the group |
|
| 120 |
//' @param iqr_factor Multiplier applied to the IQR (default 1.5, the Tukey fence) |
|
| 121 |
//' @return Numeric matrix (n_dims x 2): columns are [lower, upper]. |
|
| 122 |
//' @export |
|
| 123 |
// [[Rcpp::export]] |
|
| 124 | 6x |
arma::mat outlier_ci(arma::mat points, double iqr_factor = 1.5) {
|
| 125 | 6x |
return qe::outlier_ci(points, iqr_factor); |
| 126 |
} |
|
| 127 | ||
| 128 |
//' Pearson correlation with CI between ENA points and centroids |
|
| 129 |
//' @param points Numeric matrix (units x dims) |
|
| 130 |
//' @param centroids Numeric matrix (units x dims) |
|
| 131 |
//' @param conf_level Confidence level (default 0.95) |
|
| 132 |
//' @export |
|
| 133 |
// [[Rcpp::export]] |
|
| 134 | 3x |
arma::mat ena_correlation(arma::mat points, arma::mat centroids, |
| 135 |
double conf_level = 0.95) {
|
|
| 136 | 3x |
return qe::ena_correlation(points, centroids, conf_level); |
| 137 |
} |
|
| 138 | ||
| 139 |
//' Least-squares node positions for undirected ENA |
|
| 140 |
//' @param adj_mats Numeric matrix of line weights (units x connections) |
|
| 141 |
//' @param t Numeric matrix of rotated points (units x dims) |
|
| 142 |
//' @param num_dims Number of dimensions |
|
| 143 |
//' @return List with nodes, centroids, weights, points |
|
| 144 |
//' @export |
|
| 145 |
// [[Rcpp::export]] |
|
| 146 | 2x |
List node_positions(arma::mat adj_mats, arma::mat t, int num_dims) {
|
| 147 | 2x |
qe::NodePositions r = qe::node_positions(adj_mats, t, num_dims); |
| 148 |
return List::create( |
|
| 149 | 4x |
_("nodes") = r.nodes,
|
| 150 | 4x |
_("centroids") = r.centroids,
|
| 151 | 4x |
_("weights") = r.weights,
|
| 152 | 4x |
_("points") = r.points
|
| 153 |
); |
|
| 154 |
} |
|
| 155 | ||
| 156 |
//' Least-squares node positions for directed ENA |
|
| 157 |
//' @param line_weights Numeric matrix (units x connections) |
|
| 158 |
//' @param points Numeric matrix of rotated points (units x dims) |
|
| 159 |
//' @param num_dims Number of dimensions |
|
| 160 |
//' @return List with nodes, centroids, weights, points |
|
| 161 |
//' @export |
|
| 162 |
// [[Rcpp::export]] |
|
| 163 | 1x |
List directed_node_positions(arma::mat line_weights, arma::mat points, |
| 164 |
int num_dims) {
|
|
| 165 | 1x |
qe::NodePositions r = qe::directed_node_positions(line_weights, points, num_dims); |
| 166 |
return List::create( |
|
| 167 | 2x |
_("nodes") = r.nodes,
|
| 168 | 2x |
_("centroids") = r.centroids,
|
| 169 | 2x |
_("weights") = r.weights,
|
| 170 | 2x |
_("points") = r.points
|
| 171 |
); |
|
| 172 |
} |
|
| 173 | ||
| 174 |
//' Directed node positions with paired ground+response rows combined |
|
| 175 |
//' @param line_weights Numeric matrix (units x connections) |
|
| 176 |
//' @param points Numeric matrix of rotated points (units x dims) |
|
| 177 |
//' @param num_dims Number of dimensions |
|
| 178 |
//' @return List with nodes, centroids, weights, points |
|
| 179 |
//' @export |
|
| 180 |
// [[Rcpp::export]] |
|
| 181 | 1x |
List directed_node_positions_combine_pairs(arma::mat line_weights, |
| 182 |
arma::mat points, |
|
| 183 |
int num_dims) {
|
|
| 184 |
qe::NodePositions r = qe::directed_node_positions( |
|
| 185 | 1x |
line_weights, points, num_dims, true); |
| 186 |
return List::create( |
|
| 187 | 2x |
_("nodes") = r.nodes,
|
| 188 | 2x |
_("centroids") = r.centroids,
|
| 189 | 2x |
_("weights") = r.weights,
|
| 190 | 2x |
_("points") = r.points
|
| 191 |
); |
|
| 192 |
} |
|
| 193 | ||
| 194 |
// ============================================================================= |
|
| 195 |
// Accumulation |
|
| 196 |
// ============================================================================= |
|
| 197 | ||
| 198 |
//' Core connection matrix for one ground+response pair |
|
| 199 |
//' @param ground Numeric row vector of ground (context) code values |
|
| 200 |
//' @param response Numeric row vector of response code values |
|
| 201 |
//' @param response_weight Scalar weight applied to the response self-connection |
|
| 202 |
//' @param ordered TRUE = directed; FALSE = undirected |
|
| 203 |
//' @export |
|
| 204 |
// [[Rcpp::export]] |
|
| 205 | 2x |
arma::mat connection_matrix(arma::rowvec ground, arma::rowvec response, |
| 206 |
double response_weight = 1.0, |
|
| 207 |
bool ordered = true) {
|
|
| 208 | 2x |
return qe::connection_matrix(ground, response, response_weight, ordered); |
| 209 |
} |
|
| 210 | ||
| 211 |
//' Traditional stanza-window accumulation (rENA model) |
|
| 212 |
//' |
|
| 213 |
//' For each row k in a single conversation's code matrix, accumulates |
|
| 214 |
//' connections over a back/forward window. |
|
| 215 |
//' |
|
| 216 |
//' \strong{Undirected} (\code{ordered = FALSE}, default): upper-triangle
|
|
| 217 |
//' co-occurrences with back/forward-reference corrections. |
|
| 218 |
//' Returns \code{n_rows × choose_two(n_codes)}.
|
|
| 219 |
//' |
|
| 220 |
//' \strong{Directed} (\code{ordered = TRUE}): focal row k as response,
|
|
| 221 |
//' sum of prior window rows as ground, using the directed connection formula. |
|
| 222 |
//' \code{window_forward} is ignored. Returns \code{n_rows × n_codes²}.
|
|
| 223 |
//' |
|
| 224 |
//' @param codes Numeric matrix (rows = lines, cols = codes) for ONE conversation |
|
| 225 |
//' @param window_back Number of prior lines in window (default 1); use .Machine$integer.max for Inf |
|
| 226 |
//' @param window_forward Number of subsequent lines (default 0; ignored when ordered = TRUE) |
|
| 227 |
//' @param binary If TRUE, binarise non-zero connection counts |
|
| 228 |
//' @param ordered If TRUE, return directed n² output; FALSE (default) returns upper-tri |
|
| 229 |
//' @return Undirected: (n_rows × choose_two(n_codes)); directed: (n_rows × n_codes²) |
|
| 230 |
//' @export |
|
| 231 |
// [[Rcpp::export]] |
|
| 232 | 10x |
arma::mat accumulate_stanza(arma::mat codes, |
| 233 |
int window_back = 1, |
|
| 234 |
int window_forward = 0, |
|
| 235 |
bool binary = true, |
|
| 236 |
bool ordered = false) {
|
|
| 237 | 10x |
return qe::accumulate_stanza(codes, window_back, window_forward, binary, ordered); |
| 238 |
} |
|
| 239 | ||
| 240 |
//' Compute a column-major linear index into a multi-dimensional array |
|
| 241 |
//' |
|
| 242 |
//' Equivalent to tma's flat_index(). Throws if lengths of `indices` |
|
| 243 |
//' and `dims` differ. |
|
| 244 |
//' |
|
| 245 |
//' @param indices 0-based integer vector of per-dimension indices |
|
| 246 |
//' @param dims Integer vector of array dimensions |
|
| 247 |
//' @return Scalar integer linear index |
|
| 248 |
//' @export |
|
| 249 |
// [[Rcpp::export]] |
|
| 250 | ! |
int flat_index(std::vector<int> indices, std::vector<int> dims) {
|
| 251 | ! |
return qe::flat_index(indices, dims); |
| 252 |
} |
|
| 253 | ||
| 254 |
//' Per-row upper-triangle co-occurrence matrix |
|
| 255 |
//' |
|
| 256 |
//' For each row in \code{codes}, computes the pairwise code products
|
|
| 257 |
//' (upper-triangle) and optionally binarizes. Output has |
|
| 258 |
//' \code{choose(n_codes, 2)} columns.
|
|
| 259 |
//' |
|
| 260 |
//' @param codes Numeric matrix (rows = observations, cols = codes) |
|
| 261 |
//' @param binary If TRUE, binarise non-zero co-occurrences (default TRUE) |
|
| 262 |
//' @return Numeric matrix (n_rows x choose_two(n_codes)) |
|
| 263 |
//' @export |
|
| 264 |
// [[Rcpp::export]] |
|
| 265 | 5x |
arma::mat row_connections(arma::mat codes, bool binary = true) {
|
| 266 | 5x |
return qe::row_connections(codes, binary); |
| 267 |
} |
|
| 268 | ||
| 269 |
//' Rolling backward window sum of a code matrix |
|
| 270 |
//' |
|
| 271 |
//' For each row k, sums the raw code values over rows |
|
| 272 |
//' \code{[max(0, k - window_size + 1), k]}. Returns a matrix of the same
|
|
| 273 |
//' shape as \code{codes}. \code{window_size <= 0} is treated as 1
|
|
| 274 |
//' (current row only). |
|
| 275 |
//' |
|
| 276 |
//' @param codes Numeric matrix (rows = observations, cols = codes) |
|
| 277 |
//' @param window_size Number of rows to look back (default 1) |
|
| 278 |
//' @return Numeric matrix (same dimensions as \code{codes})
|
|
| 279 |
//' @export |
|
| 280 |
// [[Rcpp::export]] |
|
| 281 | 4x |
arma::mat rolling_window_sum(arma::mat codes, int window_size = 1) {
|
| 282 | 4x |
return qe::rolling_window_sum(codes, window_size); |
| 283 |
} |
|
| 284 | ||
| 285 |
//' Ground/response accumulation for one unit (tma model) |
|
| 286 |
//' |
|
| 287 |
//' @param codes Numeric matrix for the full context (n_rows x n_codes) |
|
| 288 |
//' @param unit_rows 0-based integer vector of rows belonging to this unit |
|
| 289 |
//' @param decay_fn R function mapping a numeric distance vector to weights |
|
| 290 |
//' @param ordered TRUE = directed; FALSE = undirected (upper-tri) |
|
| 291 |
//' @return Numeric vector of connection counts |
|
| 292 |
//' @export |
|
| 293 |
// [[Rcpp::export]] |
|
| 294 | 3x |
arma::rowvec accumulate_unit(arma::mat codes, std::vector<int> unit_rows, |
| 295 |
Function decay_fn, bool ordered = false) {
|
|
| 296 | 7x |
auto cpp_decay = [&](arma::vec distances) -> arma::vec {
|
| 297 | 7x |
NumericVector d = wrap(distances); |
| 298 | 7x |
NumericVector w = decay_fn(d); |
| 299 | 14x |
return as<arma::vec>(w); |
| 300 |
}; |
|
| 301 | 3x |
return qe::accumulate_unit(codes, unit_rows, cpp_decay, ordered); |
| 302 |
} |
|
| 303 | ||
| 304 |
//' Ground/response accumulation for one unit — returns unit vector and per-row matrix |
|
| 305 |
//' |
|
| 306 |
//' Equivalent to tma's accumulate_network() but without R-env-var setup. |
|
| 307 |
//' The decay function receives a distance vector (response=0, older rows |
|
| 308 |
//' have larger values) and returns a weight vector of the same length. |
|
| 309 |
//' |
|
| 310 |
//' @param codes Numeric matrix (n_rows x n_codes) |
|
| 311 |
//' @param unit_rows 0-based integer vector of rows belonging to this unit |
|
| 312 |
//' @param decay_fn R function(distances) -> weights |
|
| 313 |
//' @param ordered TRUE = directed (full p^2); FALSE = undirected (upper-tri) |
|
| 314 |
//' @return List with `networks` (vector) and `row_networks` (matrix) |
|
| 315 |
//' @export |
|
| 316 |
// [[Rcpp::export]] |
|
| 317 | ! |
List accumulate_unit_with_rows(arma::mat codes, std::vector<int> unit_rows, |
| 318 |
Function decay_fn, bool ordered = false) {
|
|
| 319 | ! |
auto cpp_decay = [&](int unit_row, arma::uvec ground_indices) -> arma::vec {
|
| 320 | ! |
arma::vec dists(ground_indices.n_elem); |
| 321 | ! |
for (arma::uword k = 0; k < ground_indices.n_elem; ++k) |
| 322 | ! |
dists[k] = static_cast<double>(unit_row - ground_indices[k]); |
| 323 | ! |
return as<arma::vec>(wrap(decay_fn(wrap(dists)))); |
| 324 |
}; |
|
| 325 | ! |
qe::UnitNetworks r = qe::accumulate_unit_with_rows(codes, unit_rows, cpp_decay, ordered); |
| 326 |
return List::create( |
|
| 327 | ! |
_("networks") = r.networks,
|
| 328 | ! |
_("row_networks") = r.row_networks
|
| 329 |
); |
|
| 330 |
} |
|
| 331 | ||
| 332 |
//' Tensor-based multi-modal accumulation for one unit (tma model) |
|
| 333 |
//' |
|
| 334 |
//' Pure-C++ port of tma's apply_tensor(). Accepts 0-based index vectors |
|
| 335 |
//' for sender/receiver/mode dimensions and a pre-converted integer context |
|
| 336 |
//' lookup matrix. |
|
| 337 |
//' |
|
| 338 |
//' @param tensor Numeric vector (column-major flat tensor) |
|
| 339 |
//' @param dims Integer vector of tensor dimensions |
|
| 340 |
//' @param dims_sender 0-based sender axis indices |
|
| 341 |
//' @param dims_receiver 0-based receiver axis indices |
|
| 342 |
//' @param dims_mode 0-based mode axis indices |
|
| 343 |
//' @param context_lookup Integer matrix (n_context_rows x n_factors), 0-based |
|
| 344 |
//' @param unit_rows 0-based response-row indices for this unit |
|
| 345 |
//' @param codes Numeric matrix (n_context_rows x n_codes) |
|
| 346 |
//' @param times Numeric vector of timestamps per context row |
|
| 347 |
//' @param ordered TRUE = directed; FALSE = undirected |
|
| 348 |
//' @return List with `connection_counts` (vector) and `row_connection_counts` (matrix) |
|
| 349 |
//' @export |
|
| 350 |
// [[Rcpp::export]] |
|
| 351 | ! |
List apply_tensor(arma::vec tensor, |
| 352 |
std::vector<int> dims, |
|
| 353 |
std::vector<int> dims_sender, |
|
| 354 |
std::vector<int> dims_receiver, |
|
| 355 |
std::vector<int> dims_mode, |
|
| 356 |
arma::imat context_lookup, |
|
| 357 |
std::vector<int> unit_rows, |
|
| 358 |
arma::mat codes, |
|
| 359 |
arma::vec times, |
|
| 360 |
bool ordered = true) {
|
|
| 361 |
qe::TensorNetworks r = qe::apply_tensor_unit( |
|
| 362 |
tensor, dims, dims_sender, dims_receiver, dims_mode, |
|
| 363 | ! |
context_lookup, unit_rows, codes, times, ordered); |
| 364 |
return List::create( |
|
| 365 | ! |
_("connection_counts") = r.connection_counts,
|
| 366 | ! |
_("row_connection_counts") = r.row_connection_counts
|
| 367 |
); |
|
| 368 |
} |
|
| 369 | ||
| 370 |
// ============================================================================= |
|
| 371 |
// Rotation |
|
| 372 |
// ============================================================================= |
|
| 373 | ||
| 374 |
// Pack a RotationResult into the list shape that matches rENA's ENARotationSet |
|
| 375 |
// payload (rotation + eigenvalues + column names). Column names are attached |
|
| 376 |
// to the rotation matrix as dimnames so downstream R code can index by them. |
|
| 377 |
// Eigenvalues are returned as a plain numeric vector (not an Nx1 matrix) to |
|
| 378 |
// match what rENA's `pcaResults$sdev^2` produces. |
|
| 379 | 28x |
static List pack_rotation_result(const qe::RotationResult& r) {
|
| 380 | 28x |
NumericMatrix rotation = wrap(r.rotation); |
| 381 | 28x |
CharacterVector col_names(r.column_names.begin(), r.column_names.end()); |
| 382 | 84x |
rotation.attr("dimnames") = List::create(R_NilValue, col_names);
|
| 383 | 28x |
NumericVector eigenvalues(r.eigenvalues.begin(), r.eigenvalues.end()); |
| 384 |
return List::create( |
|
| 385 | 56x |
_("rotation") = rotation,
|
| 386 | 56x |
_("eigenvalues") = eigenvalues,
|
| 387 | 56x |
_("column_names") = col_names
|
| 388 |
); |
|
| 389 |
} |
|
| 390 | ||
| 391 |
//' SVD rotation (matches prcomp(retx=F, scale=F, center=F, tol=0)) |
|
| 392 |
//' |
|
| 393 |
//' Caller is responsible for centering upstream. Eigenvalues are stored as |
|
| 394 |
//' \code{sdev^2} (variance) to match rENA's \code{ena.svd}.
|
|
| 395 |
//' |
|
| 396 |
//' @param points Numeric matrix (n_units x n_dims) |
|
| 397 |
//' @return List with \code{rotation} (n_dims x n_dims), \code{eigenvalues}
|
|
| 398 |
//' (length n_dims, = sdev^2), and \code{column_names} ("SVD1", "SVD2", ...)
|
|
| 399 |
//' @export |
|
| 400 |
// [[Rcpp::export]] |
|
| 401 | 4x |
List ena_svd(arma::mat points) {
|
| 402 | 4x |
return pack_rotation_result(qe::ena_svd(points)); |
| 403 |
} |
|
| 404 | ||
| 405 |
//' Project a matrix onto the hyperplane orthogonal to a unit-norm axis |
|
| 406 |
//' |
|
| 407 |
//' Computes \code{data - (data \%*\% axis) \%*\% t(axis)}. The caller is
|
|
| 408 |
//' responsible for ensuring \code{axis} is unit-norm.
|
|
| 409 |
//' |
|
| 410 |
//' @param data Numeric matrix (n_units x n_dims) |
|
| 411 |
//' @param axis Numeric vector of length n_dims, unit-norm |
|
| 412 |
//' @return Numeric matrix of the same shape as \code{data}
|
|
| 413 |
//' @export |
|
| 414 |
// [[Rcpp::export]] |
|
| 415 | 4x |
arma::mat deflate(arma::mat data, arma::vec axis) {
|
| 416 | 4x |
return qe::deflate(data, axis); |
| 417 |
} |
|
| 418 | ||
| 419 |
//' Orthogonal SVD — orthonormalize named axes via QR, fill the rest from SVD |
|
| 420 |
//' |
|
| 421 |
//' Mirrors rENA's \code{orthogonal_svd()} in \code{ena.rotate.by.mean.R}:
|
|
| 422 |
//' the named axes in the output are the orthonormalized Q columns, not the |
|
| 423 |
//' original \code{weights} columns. Use \code{complete_rotation} to keep
|
|
| 424 |
//' the named axes verbatim. |
|
| 425 |
//' |
|
| 426 |
//' @param data Numeric matrix (n_units x n_dims) |
|
| 427 |
//' @param weights Numeric matrix (n_dims x k); columns are the named axes |
|
| 428 |
//' @param named_labels Character vector of length k |
|
| 429 |
//' @return List with \code{rotation}, \code{eigenvalues}, \code{column_names}
|
|
| 430 |
//' @export |
|
| 431 |
// [[Rcpp::export]] |
|
| 432 | 3x |
List orthogonal_svd(arma::mat data, |
| 433 |
arma::mat weights, |
|
| 434 |
std::vector<std::string> named_labels) {
|
|
| 435 | 3x |
return pack_rotation_result(qe::orthogonal_svd(data, weights, named_labels)); |
| 436 |
} |
|
| 437 | ||
| 438 |
//' Complete a rotation — keep named axes verbatim, fill remainder from SVD |
|
| 439 |
//' |
|
| 440 |
//' Mirrors the tail of \code{ena.rotate.by.generalized}: the named axes
|
|
| 441 |
//' appear in the output exactly as provided, and the trailing columns come |
|
| 442 |
//' from an SVD of the data deflated by all named axes. |
|
| 443 |
//' |
|
| 444 |
//' @param data Numeric matrix (n_units x n_dims) |
|
| 445 |
//' @param named_axes Numeric matrix (n_dims x k); columns must be unit-norm |
|
| 446 |
//' @param named_labels Character vector of length k |
|
| 447 |
//' @return List with \code{rotation}, \code{eigenvalues}, \code{column_names}
|
|
| 448 |
//' @export |
|
| 449 |
// [[Rcpp::export]] |
|
| 450 | 3x |
List complete_rotation(arma::mat data, |
| 451 |
arma::mat named_axes, |
|
| 452 |
std::vector<std::string> named_labels) {
|
|
| 453 | 3x |
return pack_rotation_result(qe::complete_rotation(data, named_axes, named_labels)); |
| 454 |
} |
|
| 455 | ||
| 456 |
//' Means rotation |
|
| 457 |
//' |
|
| 458 |
//' For each group pair, computes a normalized mean-difference axis on the |
|
| 459 |
//' progressively-deflated data and finishes with \code{orthogonal_svd}.
|
|
| 460 |
//' The input is column-centered first, matching rENA's |
|
| 461 |
//' \code{scale(data, scale=F, center=T)} at the top of \code{ena.rotate.by.mean}.
|
|
| 462 |
//' |
|
| 463 |
//' Each element of \code{group_pairs} is a length-2 list \code{list(a, b)}
|
|
| 464 |
//' of 0-based row indices into \code{points}.
|
|
| 465 |
//' |
|
| 466 |
//' @param points Numeric matrix (n_units x n_dims) |
|
| 467 |
//' @param group_pairs List of length k; each element is \code{list(a, b)}
|
|
| 468 |
//' where \code{a} and \code{b} are 0-based integer index vectors
|
|
| 469 |
//' @return List with \code{rotation}, \code{eigenvalues}, \code{column_names}
|
|
| 470 |
//' @export |
|
| 471 |
// [[Rcpp::export]] |
|
| 472 | 8x |
List means_rotation(arma::mat points, List group_pairs) {
|
| 473 | 8x |
std::vector<qe::GroupPair> pairs; |
| 474 | 8x |
pairs.reserve(group_pairs.size()); |
| 475 | 20x |
for (R_xlen_t i = 0; i < group_pairs.size(); ++i) {
|
| 476 | 12x |
List pair = group_pairs[i]; |
| 477 | 12x |
if (pair.size() != 2) {
|
| 478 | ! |
stop("group_pairs[[%d]] must be a length-2 list(a, b)",
|
| 479 | ! |
static_cast<int>(i + 1)); |
| 480 |
} |
|
| 481 | 12x |
IntegerVector ra = pair[0]; |
| 482 | 12x |
IntegerVector rb = pair[1]; |
| 483 | 12x |
arma::uvec a(ra.size()); |
| 484 | 12x |
arma::uvec b(rb.size()); |
| 485 | 217x |
for (R_xlen_t j = 0; j < ra.size(); ++j) {
|
| 486 | 410x |
a(j) = static_cast<arma::uword>(ra[j]); |
| 487 |
} |
|
| 488 | 237x |
for (R_xlen_t j = 0; j < rb.size(); ++j) {
|
| 489 | 450x |
b(j) = static_cast<arma::uword>(rb[j]); |
| 490 |
} |
|
| 491 | 12x |
pairs.push_back({a, b});
|
| 492 |
} |
|
| 493 | 15x |
return pack_rotation_result(qe::means_rotation(points, pairs)); |
| 494 |
} |
|
| 495 | ||
| 496 |
// ============================================================================= |
|
| 497 |
// Generalized Means Rotation (GMR) |
|
| 498 |
// ============================================================================= |
|
| 499 | ||
| 500 |
//' Generalized Means Rotation |
|
| 501 |
//' |
|
| 502 |
//' Computes a rotation axis that best represents a target variable's |
|
| 503 |
//' contribution to the ENA point space, after controlling for covariates via |
|
| 504 |
//' Lasso (coordinate-descent, k-fold CV). Mirrors |
|
| 505 |
//' \code{rENA::ena.rotate.by.generalized()}.
|
|
| 506 |
//' |
|
| 507 |
//' The caller is responsible for building \code{x_model_matrix} (e.g. via
|
|
| 508 |
//' \code{model.matrix()}) and identifying \code{x1_cols} (0-based indices of
|
|
| 509 |
//' the target variable's columns, which receive \code{penalty_factor = 0}).
|
|
| 510 |
//' Categorical targets should be encoded as 0-based integers. |
|
| 511 |
//' |
|
| 512 |
//' @param V Numeric matrix (n_units x n_connections) — ENA points. |
|
| 513 |
//' @param x_model_matrix Numeric matrix (n_units x p) — model matrix for x axis. |
|
| 514 |
//' @param x_target Numeric vector length n — target variable (raw float |
|
| 515 |
//' for numeric; 0-based integer codes for categorical). |
|
| 516 |
//' @param x1_cols Integer vector of 0-based column indices in |
|
| 517 |
//' \code{x_model_matrix} that belong to the target variable (unpenalized).
|
|
| 518 |
//' @param x_categorical Logical — TRUE if target is categorical. |
|
| 519 |
//' @param x_n_groups Integer — number of distinct groups (categorical only). |
|
| 520 |
//' @param x_subset Integer vector of 0-based row indices to subset for |
|
| 521 |
//' the x-axis GMR step (pass \code{integer(0)} to use all rows).
|
|
| 522 |
//' @param has_y Logical — TRUE to compute a second GMR axis. |
|
| 523 |
//' @param y_model_matrix Numeric matrix (n_units x p) — model matrix for y axis |
|
| 524 |
//' (ignored when \code{has_y = FALSE}).
|
|
| 525 |
//' @param y_target Numeric vector — y-axis target (ignored when |
|
| 526 |
//' \code{has_y = FALSE}).
|
|
| 527 |
//' @param y1_cols 0-based column indices for y target (ignored when |
|
| 528 |
//' \code{has_y = FALSE}).
|
|
| 529 |
//' @param y_categorical Logical (ignored when \code{has_y = FALSE}).
|
|
| 530 |
//' @param y_n_groups Integer (ignored when \code{has_y = FALSE}).
|
|
| 531 |
//' @param n_lambda Length of the Lasso lambda path (default 50). |
|
| 532 |
//' @param k_folds Cross-validation folds for lambda selection (default 5). |
|
| 533 |
//' @param lasso_eps \code{lambda_min = lasso_eps * lambda_max} (default 0.01).
|
|
| 534 |
//' @return List with \code{rotation}, \code{eigenvalues}, \code{column_names}.
|
|
| 535 |
//' @export |
|
| 536 |
// [[Rcpp::export]] |
|
| 537 | 13x |
List generalized_means_rotation( |
| 538 |
arma::mat V, |
|
| 539 |
arma::mat x_model_matrix, |
|
| 540 |
arma::vec x_target, |
|
| 541 |
IntegerVector x1_cols, |
|
| 542 |
bool x_categorical, |
|
| 543 |
int x_n_groups, |
|
| 544 |
IntegerVector x_subset, |
|
| 545 |
bool has_y, |
|
| 546 |
arma::mat y_model_matrix, |
|
| 547 |
arma::vec y_target, |
|
| 548 |
IntegerVector y1_cols, |
|
| 549 |
bool y_categorical, |
|
| 550 |
int y_n_groups, |
|
| 551 |
int n_lambda = 50, |
|
| 552 |
int k_folds = 5, |
|
| 553 |
double lasso_eps = 0.01 |
|
| 554 |
) {
|
|
| 555 |
// ── Unpack x1_cols ─────────────────────────────────────────────────────── |
|
| 556 | 13x |
arma::uvec x1(x1_cols.size()); |
| 557 | 26x |
for (R_xlen_t i = 0; i < x1_cols.size(); ++i) |
| 558 | 26x |
x1(i) = static_cast<arma::uword>(x1_cols[i]); |
| 559 | ||
| 560 |
// ── Unpack x_subset (empty IntegerVector → use all rows) ───────────────── |
|
| 561 | 13x |
arma::uvec x_sub; |
| 562 | 13x |
if (x_subset.size() > 0) {
|
| 563 | 2x |
x_sub.set_size(x_subset.size()); |
| 564 | 14x |
for (R_xlen_t i = 0; i < x_subset.size(); ++i) |
| 565 | 24x |
x_sub(i) = static_cast<arma::uword>(x_subset[i]); |
| 566 |
} |
|
| 567 | ||
| 568 |
// ── Unpack y1_cols ─────────────────────────────────────────────────────── |
|
| 569 | 13x |
arma::uvec y1(y1_cols.size()); |
| 570 | 15x |
for (R_xlen_t i = 0; i < y1_cols.size(); ++i) |
| 571 | 4x |
y1(i) = static_cast<arma::uword>(y1_cols[i]); |
| 572 | ||
| 573 |
// ── Build params ───────────────────────────────────────────────────────── |
|
| 574 | 13x |
qe::GeneralizedRotationParams p; |
| 575 | 13x |
p.x_model_matrix = x_model_matrix; |
| 576 | 13x |
p.x_target = x_target; |
| 577 | 13x |
p.x1_cols = x1; |
| 578 | 13x |
p.x_categorical = x_categorical; |
| 579 | 13x |
p.x_n_groups = static_cast<arma::uword>(x_n_groups); |
| 580 | 13x |
p.x_subset = x_sub; |
| 581 | 13x |
p.has_y = has_y; |
| 582 | 13x |
p.y_model_matrix = y_model_matrix; |
| 583 | 13x |
p.y_target = y_target; |
| 584 | 13x |
p.y1_cols = y1; |
| 585 | 13x |
p.y_categorical = y_categorical; |
| 586 | 13x |
p.y_n_groups = static_cast<arma::uword>(y_n_groups); |
| 587 | 13x |
p.n_lambda = n_lambda; |
| 588 | 13x |
p.k_folds = k_folds; |
| 589 | 13x |
p.lasso_eps = lasso_eps; |
| 590 | ||
| 591 | 26x |
return pack_rotation_result(qe::generalized_means_rotation(V, p)); |
| 592 |
} |