/* * This big file contains all the taucs routines needed for the * scilab interface onto the taucs supernodal multi-frontal choleski * solver : * - internal structures used by the solver * - genmmd interface * - create / free taucs matrices routines * - permutations routines * - symbolic analysis routines * - super nodal multi frontal factor routines * - super nodal multi frontal solve routines * - various utilities * * All this code is from different files of the TAUCS solver * lib from Sivan Toledo (see the TAUCS_license.txt file). * * Very minor modifs have been done (Bruno) : * - replace the _ for calling fortran routine by the * scilab macro F2C (call to genmmd and calls to blas * routines) * - the print messages that I have stayed are redirected * to the scilab window with sciprint * - I have added a small routine to get cnz * * Thanks to Sivan Toledo */ /*************************************************************/ /* */ /*************************************************************/ #include #include #include #include "taucs_scilab.h" #include "machine.h" /* F2C macro */ #include "MALLOC.h" #include "BOOL.h" #include "sciprint.h" #include "localization.h" #ifdef _MSC_VER #define alloca(x) MALLOC(x) #endif #define BLAS_FLOPS_CUTOFF 1000.0 #define SOLVE_DENSE_CUTOFF 5 extern int C2F(genmmd)(); extern int C2F(dpotrf)(); extern int C2F(dtrsm)(); extern int C2F(dsyrk)(); extern int C2F(dgemm)(); /*************************************************************/ /* structures */ /*************************************************************/ typedef struct { int sn_size; int n; int* rowind; int up_size; int* sn_vertices; int* up_vertices; double* f1; double* f2; double* u; } supernodal_frontal_matrix; typedef struct { char uplo; /* 'u' for upper, 'l' for lower, ' ' don't know; prefer lower. */ int n; /* size of matrix */ int n_sn; /* number of supernodes */ int* parent; /* supernodal elimination tree */ int* first_child; int* next_child; int* sn_size; /* size of supernodes (diagonal block) */ int* sn_up_size; /* size of subdiagonal update blocks */ int** sn_struct; /* row structure of supernodes */ int* sn_blocks_ld; /* lda of supernode blocks */ double** sn_blocks; /* supernode blocks */ int* up_blocks_ld; /* lda of update blocks */ double** up_blocks; /* update blocks */ } supernodal_factor_matrix; /*************************************************************/ /* for qsort */ /*************************************************************/ static int* compare_indirect_map; static int compare_indirect_ints(const void* vx, const void* vy) { int* ix = (int*)vx; int* iy = (int*)vy; if (compare_indirect_map[*ix] < compare_indirect_map[*iy]) { return -1; } if (compare_indirect_map[*ix] > compare_indirect_map[*iy]) { return 1; } return 0; } /*********************************************************/ /* Interface to MMD */ /*********************************************************/ void taucs_ccs_genmmd(taucs_ccs_matrix* m, int** perm, int** invperm) { int n, maxint, delta, nofsub; int* xadj; int* adjncy; int* invp; int* prm; int* dhead; int* qsize; int* llist; int* marker; int* len; int* next; int nnz, i, j, ip; if (!(m->flags & TAUCS_SYMMETRIC)) { sciprint("%s: %s", "taucs_ccs_genmmd", _("GENMMD ordering only works on symmetric matrices.\n")); *perm = NULL; *invperm = NULL; return; } /* this routine may actually work on UPPER as well */ if (!(m->flags & TAUCS_LOWER)) { sciprint("%s: %s", "taucs_ccs_genmmd", _("The lower part of the matrix must be represented.\n")); *perm = NULL; *invperm = NULL; return; } *perm = NULL; *invperm = NULL; n = m->n; nnz = (m->colptr)[n]; /* I copied the value of delta and the size of */ /* from SuperLU. Sivan */ delta = 1; /* DELTA is a parameter to allow the choice of nodes whose degree <= min-degree + DELTA. */ maxint = 32000; assert(sizeof(int) == 4); maxint = 2147483647; /* 2**31-1, for 32-bit only! */ xadj = (int*) MALLOC((n + 1) * sizeof(int)); adjncy = (int*) MALLOC((2 * nnz - n) * sizeof(int)); invp = (int*) MALLOC((n + 1) * sizeof(int)); prm = (int*) MALLOC(n * sizeof(int)); dhead = (int*) MALLOC((n + 1) * sizeof(int)); qsize = (int*) MALLOC((n + 1) * sizeof(int)); llist = (int*) MALLOC(n * sizeof(int)); marker = (int*) MALLOC(n * sizeof(int)); if (!xadj || !adjncy || !invp || !prm || !dhead || !qsize || !llist || !marker) { FREE(xadj ); FREE(adjncy); FREE(invp ); FREE(prm ); FREE(dhead ); FREE(qsize ); FREE(llist ); FREE(marker); return; } len = dhead; /* we reuse space */ next = qsize; /* we reuse space */ for (i = 0; i < n; i++) { len[i] = 0; } for (j = 0; j < n; j++) { for (ip = (m->colptr)[j]; ip < (m->colptr)[j + 1]; ip++) { i = (m->rowind)[ip]; if (i != j) { len[i] ++; len[j] ++; } } } xadj[0] = 1; for (i = 1; i <= n; i++) { xadj[i] = xadj[i - 1] + len[i - 1]; } /* use degree as a temporary */ for (i = 0; i < n; i++) { next[i] = xadj[i] - 1; } for (j = 0; j < n; j++) { for (ip = (m->colptr)[j]; ip < (m->colptr)[j + 1]; ip++) { i = (m->rowind)[ip]; assert( next[i] < 2 * nnz - n ); assert( next[j] < 2 * nnz - n ); if (i != j) { adjncy[ next[i] ] = j + 1; adjncy[ next[j] ] = i + 1; next[i] ++; next[j] ++; } } } F2C(genmmd)(&n, xadj, adjncy, invp, prm, &delta, dhead, qsize, llist, marker, &maxint, &nofsub); FREE(marker); FREE(llist ); FREE(qsize ); FREE(dhead ); FREE(xadj ); FREE(adjncy); for (i = 0; i < n; i++) { prm[i] --; } for (i = 0; i < n; i++) { invp[ prm[i] ] = i; } *perm = prm; *invperm = invp; } /*********************************************************/ /* CCS : create free taucs matrix routines */ /*********************************************************/ taucs_ccs_matrix* taucs_ccs_create(int m, int n, int nnz) { taucs_ccs_matrix* matrix; matrix = (taucs_ccs_matrix*) MALLOC(sizeof(taucs_ccs_matrix)); if (!matrix) { sciprint(_("%s: No more memory.\n"), "taucs_ccs_create"); return NULL; } matrix->flags = 0; matrix->n = n; matrix->m = m; matrix->colptr = (int*) MALLOC((n + 1) * sizeof(int)); matrix->rowind = (int*) MALLOC(nnz * sizeof(int)); matrix->values = (double*) MALLOC(nnz * sizeof(double)); if (!(matrix->colptr) || !(matrix->rowind)) { sciprint(_("%s: No more memory (n=%d, nnz=%d).\n"), "taucs_ccs_create", n, nnz); FREE(matrix->colptr); FREE(matrix->rowind); FREE(matrix->values); FREE (matrix); return NULL; } return matrix; } void taucs_ccs_free(taucs_ccs_matrix* matrix) { FREE(matrix->rowind); FREE(matrix->colptr); FREE(matrix->values); FREE(matrix); } /*********************************************************/ /* CCS : permutation routines */ /*********************************************************/ taucs_ccs_matrix* taucs_ccs_permute_symmetrically(taucs_ccs_matrix* A, int* perm, int* invperm) { taucs_ccs_matrix* PAPT; int n; int nnz; /* int* colptr; */ int* len; int i, j, ip, I, J; double AIJ; /*assert(A->flags & TAUCS_SYMMETRIC);*/ assert(A->flags & TAUCS_LOWER); n = A->n; nnz = (A->colptr)[n]; PAPT = taucs_ccs_create(n, n, nnz); /*PAPT->flags = TAUCS_SYMMETRIC | TAUCS_LOWER;*/ PAPT->flags = A->flags; len = (int*) MALLOC(n * sizeof(int)); //colptr = (int*) MALLOC(n * sizeof(int)); for (j = 0; j < n; j++) { len[j] = 0; } for (j = 0; j < n; j++) { for (ip = (A->colptr)[j]; ip < (A->colptr)[j + 1]; ip++) { /*i = (A->rowind)[ip] - (A->indshift);*/ i = (A->rowind)[ip]; I = invperm[i]; J = invperm[j]; if (I < J) { int T = I; J = T; } len[J] ++; } } (PAPT->colptr)[0] = 0; for (j = 1; j <= n; j++) { (PAPT->colptr)[j] = (PAPT->colptr)[j - 1] + len[j - 1]; } for (j = 0; j < n; j++) { len[j] = (PAPT->colptr)[j]; } for (j = 0; j < n; j++) { for (ip = (A->colptr)[j]; ip < (A->colptr)[j + 1]; ip++) { /*i = (A->rowind)[ip] - (A->indshift);*/ i = (A->rowind)[ip]; AIJ = (A->values)[ip]; I = invperm[i]; J = invperm[j]; if (I < J) { int T = I; I = J; J = T; } /*(PAPT->rowind)[ len[J] ] = I + (PAPT->indshift);*/ (PAPT->rowind)[ len[J] ] = I; (PAPT->values)[ len[J] ] = AIJ; len[J] ++; } } if (len) { FREE(len); len = NULL; } return PAPT; } void taucs_vec_permute(int n, double v[], double pv[], int p[]) { int i; for (i = 0; i < n; i++) { pv[i] = v[p[i]]; } } void taucs_vec_ipermute(int n, double pv[], double v[], int invp[]) { int i; for (i = 0; i < n; i++) { v[invp[i]] = pv[i]; } } /*************************************************************/ /* create and free the factor object */ /*************************************************************/ static supernodal_factor_matrix* multifrontal_supernodal_create(void) { supernodal_factor_matrix* L; L = (supernodal_factor_matrix*) MALLOC(sizeof(supernodal_factor_matrix)); if (!L) { return NULL; } L->uplo = 'l'; L->n = -1; /* unused */ L->sn_struct = NULL; L->sn_size = NULL; L->sn_up_size = NULL; L->parent = NULL; L->first_child = NULL; L->next_child = NULL; L->sn_blocks_ld = NULL; L->sn_blocks = NULL; L->up_blocks_ld = NULL; L->up_blocks = NULL; return L; } void taucs_supernodal_factor_free(void* vL) { supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL; int sn; FREE(L->parent); FREE(L->first_child); FREE(L->next_child); FREE(L->sn_size); FREE(L->sn_up_size); FREE(L->sn_blocks_ld); FREE(L->up_blocks_ld); if (L->sn_struct) for (sn = 0; sn < L->n_sn; sn++) { FREE(L->sn_struct[sn]); } if (L->sn_blocks) for (sn = 0; sn < L->n_sn; sn++) { FREE(L->sn_blocks[sn]); } if (L->up_blocks) for (sn = 0; sn < L->n_sn; sn++) { FREE(L->up_blocks[sn]); } /* for (sn=0; snn_sn; sn++) { FREE(L->sn_struct[sn]); FREE(L->sn_blocks[sn]); FREE(L->up_blocks[sn]); }*/ FREE(L->sn_struct); FREE(L->sn_blocks); FREE(L->up_blocks); FREE(L); } /*************************************************************/ /* create and free frontal matrices */ /*************************************************************/ static supernodal_frontal_matrix* supernodal_frontal_create(int* firstcol_in_supernode, int sn_size, int n, int* rowind) { supernodal_frontal_matrix* tmp; int iDontCheckf1 = 0; int iDontCheckf2 = 0; int iDontChecku = 0; tmp = (supernodal_frontal_matrix*)MALLOC(sizeof(supernodal_frontal_matrix)); if (tmp == NULL) { return NULL; } tmp->sn_size = sn_size; tmp->n = n; tmp->rowind = rowind; tmp->n = n; tmp->sn_size = sn_size; tmp->up_size = n - sn_size; tmp->sn_vertices = rowind; tmp->up_vertices = rowind + sn_size; if (tmp->sn_size > 0) { tmp->f1 = (double*)CALLOC((tmp->sn_size) * (tmp->sn_size), sizeof(double)); } else { iDontCheckf1 = 1; tmp->f1 = NULL; } if (tmp->sn_size > 0 && tmp->up_size > 0) { tmp->f2 = (double*)CALLOC((tmp->up_size) * (tmp->sn_size), sizeof(double)); } else { iDontCheckf2 = 1; tmp->f2 = NULL; } if (tmp->up_size > 0) { tmp->u = (double*)CALLOC((tmp->up_size) * (tmp->up_size), sizeof(double)); } else { iDontChecku = 1; tmp->u = NULL; } /*check allocation only if size is > 0 */ if ( (tmp->f1 == NULL && iDontCheckf1 == 0) || (tmp->f2 == NULL && iDontCheckf2 == 0) || (tmp->u == NULL && iDontChecku == 0)) { FREE(tmp->u); FREE(tmp->f1); FREE(tmp->f2); FREE(tmp); return NULL; } assert(tmp); return tmp; } static void supernodal_frontal_free(supernodal_frontal_matrix* to_del) { /* f1 and f2 are moved to the factor */ FREE(to_del->u); FREE(to_del); } /*************************************************************/ /* factor a frontal matrix */ /*************************************************************/ static int multifrontal_supernodal_front_factor(int sn, int* firstcol_in_supernode, int sn_size, taucs_ccs_matrix* A, supernodal_frontal_matrix* mtr, int* bitmap, supernodal_factor_matrix* snL) { int i, j; int* ind; double* re; int INFO = -1; double done = 1.0; double dminusone = -1.0; /* creating transform for real indices */ for (i = 0; i < mtr->sn_size; i++) { bitmap[mtr->sn_vertices[i]] = i; } for (i = 0; i < mtr->up_size; i++) { bitmap[mtr->up_vertices[i]] = mtr->sn_size + i; } /* adding sn_size column of A to first sn_size column of frontal matrix */ for (j = 0; j < (mtr->sn_size); j++) { ind = &(A->rowind[A->colptr[*(firstcol_in_supernode + j)]]); re = &(A->values[A->colptr[*(firstcol_in_supernode + j)]]); for (i = 0; i < A->colptr[*(firstcol_in_supernode + j) + 1] - A->colptr[*(firstcol_in_supernode + j)]; i++) { if (bitmap[ind[i]] < mtr->sn_size) { mtr->f1[ (mtr->sn_size)*j + bitmap[ind[i]]] += re[i]; } else { mtr->f2[ (mtr->up_size)*j + bitmap[ind[i]] - mtr->sn_size] += re[i]; } } } /* we use the BLAS through the Fortran interface */ /* solving of lower triangular system for L */ if (mtr->sn_size) { F2C(dpotrf)("LOWER", &(mtr->sn_size), mtr->f1, &(mtr->sn_size), &INFO); } if (INFO && INFO != -1) /* INFO = -1 if not initialized */ { sciprint(_(" CC^T Factorization: Matrix is not positive definite.\n")); sciprint(_(" nonpositive pivot in column %d\n"), mtr->sn_vertices[INFO - 1]); return -1; } /* getting completion for found columns of L */ if (mtr->up_size && mtr->sn_size) { F2C(dtrsm)("Right", "Lower", "Transpose", "No unit diagonal", &(mtr->up_size), &(mtr->sn_size), &done, mtr->f1, &(mtr->sn_size), mtr->f2, &(mtr->up_size)); } (snL->sn_blocks )[sn] = mtr->f1; (snL->sn_blocks_ld)[sn] = mtr->sn_size; (snL->up_blocks )[sn] = mtr->f2; (snL->up_blocks_ld)[sn] = mtr->up_size; /* computation of updated part of frontal matrix */ if (mtr->up_size && mtr->sn_size) { F2C(dsyrk)("Lower", "No Transpose", &(mtr->up_size), &(mtr->sn_size), &dminusone, mtr->f2, &(mtr->up_size), &done, mtr->u, &(mtr->up_size)); } return 0; } /*************************************************************/ /* extend-add */ /*************************************************************/ static void multifrontal_supernodal_front_extend_add( supernodal_frontal_matrix* parent_mtr, supernodal_frontal_matrix* my_mtr, int* bitmap) { int j, i, parent_i, parent_j; double v; for (i = 0; i < parent_mtr->sn_size; i++) { bitmap[parent_mtr->sn_vertices[i]] = i; } for (i = 0; i < parent_mtr->up_size; i++) { bitmap[parent_mtr->up_vertices[i]] = (parent_mtr->sn_size) + i; } /* extend add operation for update matrix */ for (j = 0; j < my_mtr->up_size; j++) { for (i = j; i < my_mtr->up_size; i++) { parent_j = bitmap[ my_mtr->up_vertices[j] ]; parent_i = bitmap[ my_mtr->up_vertices[i] ]; /* we could skip this if indices were sorted */ if (parent_j > parent_i) { int tmp = parent_j; parent_j = parent_i; parent_i = tmp; } v = (my_mtr->u)[(my_mtr->up_size) * j + i]; if (parent_j < parent_mtr->sn_size) { if (parent_i < parent_mtr->sn_size) { (parent_mtr->f1)[ (parent_mtr->sn_size)*parent_j + parent_i] += v; } else { (parent_mtr->f2)[ (parent_mtr->up_size)*parent_j + (parent_i - parent_mtr->sn_size)] += v; } } else { (parent_mtr->u)[ (parent_mtr->up_size) * (parent_j - parent_mtr->sn_size) + (parent_i - parent_mtr->sn_size)] += v; } } } } /*************************************************************/ /* symbolic elimination */ /*************************************************************/ /* UNION FIND ROUTINES */ static int uf_makeset(int* uf, int i) { uf[i] = i; return i; } static int uf_find (int* uf, int i) { if (uf[i] != i) { uf[i] = uf_find(uf, uf[i]); } return uf[i]; } static int uf_union (int* uf, int s, int t) { if (uf_find(uf, s) < uf_find(uf, t)) { uf[uf_find(uf, s)] = uf_find(uf, t); return (uf_find(uf, t)); } else { uf[uf_find(uf, s)] = uf_find(uf, t); return (uf_find(uf, t)); } } static void recursive_postorder(int j, int first_child[], int next_child[], int postorder[], int ipostorder[], int* next) { int c; for (c = first_child[j]; c != -1; c = next_child[c]) { recursive_postorder(c, first_child, next_child, postorder, ipostorder, next); } if (postorder) { postorder [*next] = j; } if (ipostorder) { ipostorder[j] = *next; } (*next)++; } #define GILBERT_NG_PEYTON_ANALYSIS_SUP /* in a few tests the supernodal version seemed slower */ #undef GILBERT_NG_PEYTON_ANALYSIS_SUP static int ordered_uf_makeset(int* uf, int i) { uf[i] = i; return i; } static int ordered_uf_find (int* uf, int i) { if (uf[i] != i) { uf[i] = uf_find(uf, uf[i]); } return uf[i]; } static int ordered_uf_union (int* uf, int s, int t) { assert(uf[t] == t); assert(uf[s] == s); assert(t > s); if (t > s) { uf[s] = t; return t; } else { uf[t] = s; } return s; } static void tree_level(int j, int isroot, int first_child[], int next_child[], int level[], int level_j) { int c; if (!isroot) { level[j] = level_j; } for (c = first_child[j]; c != -1; c = next_child[c]) { tree_level(c, FALSE, first_child, next_child, level, level_j + 1); } } static void tree_first_descendant(int j, int isroot, int first_child[], int next_child[], int ipostorder[], int first_descendant[]) { int c; int fd = ipostorder[j]; for (c = first_child[j]; c != -1; c = next_child[c]) { tree_first_descendant(c, FALSE, first_child, next_child, ipostorder, first_descendant); if (first_descendant[c] < fd) { fd = first_descendant[c]; } } if (!isroot) { first_descendant[j] = fd; } } int taucs_ccs_etree(taucs_ccs_matrix* A, int* parent, int* l_colcount, int* l_rowcount, int* l_nnz) { int* prev_p; int* level; int* l_cc; int* l_rc; int* wt; int n = A->n; int u, pprime; int ju; int* postorder; int* ipostorder; int *first_child, *next_child; int i, j, k, ip, jp, kp; int nnz, jnnz; int* uf; int* rowptr; int* colind; int* rowcount; int* realroot; /* we need the row structures for the lower triangle */ nnz = (A->colptr)[n]; uf = (int*)MALLOC(n * sizeof(int)); rowcount = (int*)MALLOC((n + 1) * sizeof(int)); rowptr = (int*)MALLOC((n + 1) * sizeof(int)); colind = (int*)MALLOC(nnz * sizeof(int)); for (i = 0; i <= n; i++) { rowcount[i] = 0; } for (j = 0; j < n; j++) { jnnz = (A->colptr)[j + 1] - (A->colptr)[j]; for (ip = 0; ip < jnnz; ip++) { i = (A->rowind)[(A->colptr)[j] + ip]; if (j < i) { rowcount[i]++; } } } ip = 0; for (i = 0; i <= n; i++) { int next_ip = ip + rowcount[i]; rowcount[i] = ip; rowptr [i] = ip; ip = next_ip; } for (j = 0; j < n; j++) { jnnz = (A->colptr)[j + 1] - (A->colptr)[j]; for (ip = 0; ip < jnnz; ip++) { i = (A->rowind)[(A->colptr)[j] + ip]; if (i == j) { continue; } assert( rowcount[i] < rowptr[i + 1] ); colind[ rowcount[i] ] = j; rowcount[i]++; } } /* now compute the etree */ { int u2, t, vroot; realroot = rowcount; /* reuse space */ for (i = 0; i < n; i++) { uf_makeset(uf, i); realroot[i] = i; parent[i] = n; vroot = i; for (kp = rowptr[i]; kp < rowptr[i + 1]; kp++) { k = colind[kp]; u2 = uf_find(uf, k); t = realroot[u2]; if (parent[t] == n && t != i) { parent[t] = i; vroot = uf_union(uf, vroot, u2); realroot[vroot] = i; } } } } FREE(colind); FREE(rowptr); FREE(rowcount); /* compute column counts */ if (l_colcount || l_rowcount || l_nnz) { int* l_nz; int tmp; int u2, p, q; first_child = (int*)MALLOC((n + 1) * sizeof(int)); next_child = (int*)MALLOC((n + 1) * sizeof(int)); postorder = (int*)MALLOC(n * sizeof(int)); ipostorder = (int*)MALLOC(n * sizeof(int)); wt = (int*)MALLOC(n * sizeof(int)); level = (int*)MALLOC(n * sizeof(int)); prev_p = (int*)MALLOC(n * sizeof(int)); #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP prev_nbr = MALLOC(n * sizeof(int)); first_descendant = MALLOC(n * sizeof(int)); #endif /* compute the postorder */ for (j = 0; j <= n; j++) { first_child[j] = -1; } for (j = n - 1; j >= 0; j--) { next_child[j] = first_child[parent[j]]; first_child[parent[j]] = j; } { int next = 0; recursive_postorder(n, first_child, next_child, postorder, ipostorder, &next); } /* we allocate scratch vectors to avoid conditionals */ /* in the inner loop. */ if (l_colcount) { l_cc = l_colcount; } else { l_cc = (int*) MALLOC(n * sizeof(int)); } if (l_rowcount) { l_rc = l_rowcount; } else { l_rc = (int*) MALLOC(n * sizeof(int)); } if (l_nnz) { l_nz = l_nnz; } else { l_nz = &tmp; } /* sort by postorder of etree */ /* compute level, fst_desc */ tree_level(n, TRUE, first_child, next_child, level, -1); for (u = 0; u < n; u++) { prev_p [u] = -1; } for (u = 0; u < n; u++) { l_rc [u] = 1; } for (u = 0; u < n; u++) { ordered_uf_makeset(uf, u); } for (u = 0; u < n; u++) { if (first_child[u] == -1) { wt[u] = 1; /* leaves */ } else { wt[u] = 0; /* nonleaves */ } } #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP for (u = 0; u < n; u++) { prev_nbr[u] = -1; } tree_first_descendant(n, TRUE, first_child, next_child, ipostorder, first_descendant); #endif FREE(first_child); FREE(next_child); for (p = 0; p < n; p++) { jp = postorder[p]; if (parent[jp] != n) { wt[parent[jp]] --; } for (ip = (A->colptr)[jp]; ip < (A->colptr)[jp + 1]; ip++) { ju = (A->rowind)[ip]; if (ju == jp) { continue; /* we only want proper neighbors */ } #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP if (first_descendant[jp] > prev_nbr[u]) { #else if (1) { #endif wt[jp] ++; pprime = prev_p[ju]; if (pprime == -1) { l_rc[ju] += level[jp] - level[ju]; } else { q = ordered_uf_find(uf, pprime); l_rc[ju] += level[jp] - level[q]; wt[q] --; } prev_p[ju] = jp; } #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP prev_nbr[u] = p; #endif } if (parent[jp] != n) { if (!(ipostorder[parent[jp]] > ipostorder[jp])) { sciprint("jp %d parent %d (ipo_j %d ipo_parent %d\n\r", jp, parent[jp], ipostorder[jp], ipostorder[parent[jp]]); } assert(ipostorder[parent[jp]] > ipostorder[jp]); ordered_uf_union(uf, jp, parent[jp]); } } *l_nz = 0; for (u2 = 0; u2 < n; u2++) { l_cc[u2] = wt[u2]; *l_nz += wt[u2]; } for (u2 = 0; u2 < n; u2++) { if (parent[u2] != n) { l_cc[parent[u2]] += l_cc[u2]; *l_nz += l_cc[u2]; } } /* free scrtach vectors */ if (!l_colcount) { FREE(l_cc); } if (!l_rowcount) { FREE(l_rc); } /* free other data structures */ FREE(postorder); FREE(ipostorder); FREE(wt); FREE(level); FREE(prev_p); #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP FREE(prev_nbr); FREE(first_descendant); #endif } FREE(uf); return 0; } int taucs_ccs_etree_liu(taucs_ccs_matrix* A, int* parent, int* l_colcount, int* l_rowcount, int* l_nnz) { int n = A->n; int i, j, k, ip, kp; int nnz, jnnz; int* uf; int* rowptr; int* colind; int* rowcount; int* marker; int* realroot; int* l_cc; int* l_rc; /* we need the row structures for the lower triangle */ nnz = (A->colptr)[n]; uf = (int*)MALLOC(n * sizeof(int)); rowcount = (int*)MALLOC((n + 1) * sizeof(int)); rowptr = (int*)MALLOC((n + 1) * sizeof(int)); colind = (int*)MALLOC(nnz * sizeof(int)); for (i = 0; i <= n; i++) { rowcount[i] = 0; } for (j = 0; j < n; j++) { jnnz = (A->colptr)[j + 1] - (A->colptr)[j]; for (ip = 0; ip < jnnz; ip++) { i = (A->rowind)[(A->colptr)[j] + ip]; if (j < i) { rowcount[i]++; } } } ip = 0; for (i = 0; i <= n; i++) { int next_ip = ip + rowcount[i]; rowcount[i] = ip; rowptr [i] = ip; ip = next_ip; } for (j = 0; j < n; j++) { jnnz = (A->colptr)[j + 1] - (A->colptr)[j]; for (ip = 0; ip < jnnz; ip++) { i = (A->rowind)[(A->colptr)[j] + ip]; if (i == j) { continue; } assert( rowcount[i] < rowptr[i + 1] ); colind[ rowcount[i] ] = j; rowcount[i]++; } } /* now compute the etree */ { int u, t, vroot; realroot = rowcount; /* reuse space */ for (i = 0; i < n; i++) { uf_makeset(uf, i); realroot[i] = i; parent[i] = n; vroot = i; for (kp = rowptr[i]; kp < rowptr[i + 1]; kp++) { k = colind[kp]; u = uf_find(uf, k); t = realroot[u]; if (parent[t] == n && t != i) { parent[t] = i; vroot = uf_union(uf, vroot, u); realroot[vroot] = i; } } } } /* compute column counts */ if (l_colcount || l_rowcount || l_nnz) { int* l_nz; int tmp; /* we allocate scratch vectors to avoid conditionals */ /* in the inner loop. */ if (l_colcount) { l_cc = l_colcount; } else { l_cc = (int*) MALLOC(n * sizeof(int)); } if (l_rowcount) { l_rc = l_rowcount; } else { l_rc = (int*) MALLOC(n * sizeof(int)); } if (l_nnz) { l_nz = l_nnz; } else { l_nz = &tmp; } marker = rowcount; /* we reuse the space */ for (j = 0; j < n; j++) { l_cc[j] = 1; } *l_nz = n; for (i = 0; i < n; i++) { marker[i] = n; /* clear the array */ } for (i = 0; i < n; i++) { l_rc[i] = 1; marker[ i ] = i; for (kp = rowptr[i]; kp < rowptr[i + 1]; kp++) { k = colind[kp]; j = k; while (marker[j] != i) { l_cc[j]++; l_rc[i]++; (*l_nz)++; marker[j] = i; j = parent[j]; } } } /* free scrtach vectors */ if (!l_colcount) { FREE(l_cc); } if (!l_rowcount) { FREE(l_rc); } } FREE(colind); FREE(rowptr); FREE(rowcount); FREE(uf); return 0; } static void recursive_symbolic_elimination(int j, taucs_ccs_matrix* A, int first_child[], int next_child[], int* n_sn, int sn_size[], int sn_up_size[], int* sn_rowind[], int sn_first_child[], int sn_next_child[], int rowind[], int column_to_sn_map[], int map[], int do_order, int ipostorder[] ) { int i, ip, c, c_sn; int in_previous_sn; int nnz; for (c = first_child[j]; c != -1; c = next_child[c]) { recursive_symbolic_elimination(c, A, first_child, next_child, n_sn, sn_size, sn_up_size, sn_rowind, sn_first_child, sn_next_child, rowind, /* temporary */ column_to_sn_map, map, do_order, ipostorder ); } in_previous_sn = 1; if (j == A->n) { in_previous_sn = 0; /* this is not a real column */ } else if (first_child[j] == -1) { in_previous_sn = 0; /* this is a leaf */ } else if (next_child[first_child[j]] != -1) { in_previous_sn = 0; /* more than 1 child */ } else { /* check that the structure is nested */ /* map contains child markers */ c = first_child[j]; for (ip = (A->colptr)[j]; ip < (A->colptr)[j + 1]; ip++) { i = (A->rowind)[ip]; in_previous_sn = in_previous_sn && (map[i] == c); } } if (in_previous_sn) { c = first_child[j]; c_sn = column_to_sn_map[c]; column_to_sn_map[j] = c_sn; /* swap row indices so j is at the end of the */ /* supernode, not in the update indices */ for (ip = sn_size[c_sn]; ip < sn_up_size[c_sn]; ip++) if (sn_rowind[c_sn][ip] == j) { break; } assert(ip < sn_up_size[c_sn]); sn_rowind[c_sn][ip] = sn_rowind[c_sn][sn_size[c_sn]]; sn_rowind[c_sn][sn_size[c_sn]] = j; /* mark the nonzeros in the map */ for (ip = sn_size[c_sn]; ip < sn_up_size[c_sn]; ip++) { map[ sn_rowind[c_sn][ip] ] = j; } sn_size [c_sn]++; return; } /* we are in a new supernode */ if (j < A->n) { nnz = 1; rowind[0] = j; map[j] = j; for (c = first_child[j]; c != -1; c = next_child[c]) { c_sn = column_to_sn_map[c]; for (ip = sn_size[c_sn]; ip < sn_up_size[c_sn]; ip++) { i = sn_rowind[c_sn][ip]; if (i > j && map[i] != j) /* new row index */ { map[i] = j; rowind[nnz] = i; nnz++; } } } for (ip = (A->colptr)[j]; ip < (A->colptr)[j + 1]; ip++) { i = (A->rowind)[ip]; if (map[i] != j) /* new row index */ { map[i] = j; rowind[nnz] = i; nnz++; } } } /*printf("children of sn %d: ",*n_sn);*/ for (c = first_child[j]; c != -1; c = next_child[c]) { c_sn = column_to_sn_map[c]; /*printf("%d ",c_sn);*/ if (c == first_child[j]) { sn_first_child[*n_sn] = c_sn; } else { sn_next_child[ c_sn ] = sn_first_child[*n_sn]; sn_first_child[*n_sn] = c_sn; } } /*printf("\n");*/ if (j < A->n) { column_to_sn_map[j] = *n_sn; sn_size [*n_sn] = 1; sn_up_size[*n_sn] = nnz; sn_rowind [*n_sn] = (int*) MALLOC(nnz * sizeof(int)); for (ip = 0; ip < nnz; ip++) { sn_rowind[*n_sn][ip] = rowind[ip]; } if (do_order) { /* Sivan and Vladimir: we think that we can sort in */ /* column order, not only in etree postorder. */ /* radix_sort(sn_rowind [*n_sn],nnz); qsort(sn_rowind [*n_sn],nnz,sizeof(int),compare_ints); */ compare_indirect_map = ipostorder; qsort(sn_rowind [*n_sn], nnz, sizeof(int), compare_indirect_ints); } assert(sn_rowind [*n_sn][0] == j); (*n_sn)++; } return; } /* count zeros and nonzeros in a supernode to compute the */ /* utility of merging fundamental supernodes. */ typedef struct { double zeros; double nonzeros; } znz; static znz recursive_amalgamate_supernodes(int sn, int* n_sn, int sn_size[], int sn_up_size[], int* sn_rowind[], int sn_first_child[], int sn_next_child[], int rowind[], int column_to_sn_map[], int map[], int do_order, int ipostorder[] ) { int i, ip, c_sn, gc_sn; int nnz; int nchildren; /* number of children, child index */ znz* c_znz; znz sn_znz, merged_znz; int new_sn_size, new_sn_up_size; sn_znz.zeros = 0.0; sn_znz.nonzeros = (double) (((sn_up_size[sn] - sn_size[sn]) * sn_size[sn]) + (sn_size[sn] * (sn_size[sn] + 1)) / 2); if (sn_first_child[sn] == -1) /* leaf */ { return sn_znz; } nchildren = 0; for (c_sn = sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) { nchildren++; } c_znz = (znz*) alloca(nchildren * sizeof(znz)); //printf("supernode %d out of %d\n",sn,*n_sn); /* merge the supernode with its children! */ i = 0; for (c_sn = sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) { c_znz[i] = recursive_amalgamate_supernodes(c_sn, n_sn, sn_size, sn_up_size, sn_rowind, sn_first_child, sn_next_child, rowind, /* temporary */ column_to_sn_map, map, do_order, ipostorder ); assert(c_znz[i].zeros + c_znz[i].nonzeros == (double) (((sn_up_size[c_sn] - sn_size[c_sn]) * sn_size[c_sn]) + (sn_size[c_sn] * (sn_size[c_sn] + 1)) / 2 )); i++; } merged_znz.nonzeros = sn_znz.nonzeros; merged_znz.zeros = sn_znz.zeros; for (i = 0; i < nchildren; i++) { merged_znz.nonzeros += (c_znz[i]).nonzeros; merged_znz.zeros += (c_znz[i]).zeros; } /* printf("supernode %d out of %d (continuing)\n",sn,*n_sn);*/ /* should we merge the supernode with its children? */ nnz = 0; for (c_sn = sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) { for (ip = 0; ip < sn_size[c_sn]; ip++) { i = sn_rowind[c_sn][ip]; assert( map[i] != sn ); map[i] = sn; rowind[nnz] = i; nnz++; } } for (ip = 0; ip < sn_size[sn]; ip++) { i = sn_rowind[sn][ip]; assert( map[i] != sn ); map[i] = sn; rowind[nnz] = i; nnz++; } new_sn_size = nnz; for (c_sn = sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) { for (ip = sn_size[c_sn]; ip < sn_up_size[c_sn]; ip++) { i = sn_rowind[c_sn][ip]; if (map[i] != sn) /* new row index */ { map[i] = sn; rowind[nnz] = i; nnz++; } } } for (ip = sn_size[sn]; ip < sn_up_size[sn]; ip++) { i = sn_rowind[sn][ip]; if (map[i] != sn) /* new row index */ { map[i] = sn; rowind[nnz] = i; nnz++; } } new_sn_up_size = nnz; if (do_order) { compare_indirect_map = ipostorder; qsort(rowind, nnz, sizeof(int), compare_indirect_ints); } /* determine whether we should merge the supernode and its children */ { int n; double* zcount; n = 0; for (ip = 0; ip < nnz; ip++) { i = rowind[ip]; if (i >= n) { n = i + 1; } } zcount = (double*) malloc(n * sizeof(double)); assert(zcount); for (ip = 0; ip < new_sn_size; ip++) { i = rowind[ip]; assert(i < n); zcount[i] = (double) (ip + 1); } for (ip = new_sn_size; ip < new_sn_up_size; ip++) { i = rowind[ip]; assert(i < n); zcount[i] = (double) new_sn_size; } /* for (ip=0; ip= 0.0); merged_znz.zeros += zcount[i]; } FREE( zcount ); /*printf("zeros after merging %.0f\n",merged_znz.zeros);*/ /* voodoo constants (need some kind of a utility function */ if ((new_sn_size < 16) || ((sn_size[sn] < 50) && (merged_znz.zeros < 0.5 * merged_znz.nonzeros)) || ((sn_size[sn] < 250) && (merged_znz.zeros < 0.25 * merged_znz.nonzeros)) || ((sn_size[sn] < 500) && (merged_znz.zeros < 0.10 * merged_znz.nonzeros)) || (merged_znz.zeros < 0.05 * merged_znz.nonzeros) ) { /* taucs_printf("merging sn %d, zeros (%f) vs nonzeros (%f)\n", sn,merged_znz.zeros,merged_znz.nonzeros); */ } else { /* taucs_printf("sn %d, too many zeros (%f) vs nonzeros (%f)\n", sn,merged_znz.zeros,merged_znz.nonzeros); printf("returning without merging\n"); */ return sn_znz; } } /* now merge the children lists */ sn_size[sn] = new_sn_size; sn_up_size[sn] = new_sn_up_size; sn_rowind[sn] = (int*) REALLOC(sn_rowind[sn], new_sn_up_size * sizeof(int)); for (ip = 0; ip < new_sn_up_size; ip++) { sn_rowind[sn][ip] = rowind[ip]; } // printf("supernode %d out of %d (merging)\n",sn,*n_sn); nchildren = 0; for (c_sn = sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) { for (ip = 0; ip < sn_size[c_sn]; ip++) { i = (sn_rowind[c_sn])[ip]; assert(column_to_sn_map[i] == c_sn); column_to_sn_map[i] = sn; } for (gc_sn = sn_first_child[c_sn]; gc_sn != -1; gc_sn = sn_next_child[gc_sn]) { rowind[nchildren] = gc_sn; nchildren++; } } /* free the children's rowind vectors */ for (c_sn = sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) { FREE( sn_rowind[c_sn] ); sn_rowind[c_sn] = NULL; sn_size[c_sn] = 0; sn_up_size[c_sn] = 0; } sn_first_child[sn] = -1; for (i = 0; i < nchildren; i++) { sn_next_child[ rowind[i] ] = sn_first_child[sn]; sn_first_child[sn] = rowind[i]; } /* printf("supernode %d out of %d (done)\n",sn,*n_sn); printf("returning, merging\n"); */ return merged_znz; } int taucs_ccs_symbolic_elimination(taucs_ccs_matrix* A, void* vL, int do_order ) { supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL; int* first_child; int* next_child; int j; int* column_to_sn_map; int* map; int* rowind; int* parent; int* ipostorder; L->n = A->n; L->sn_struct = (int**)MALLOC((A->n ) * sizeof(int*)); L->sn_size = (int*) MALLOC((A->n + 1) * sizeof(int)); L->sn_up_size = (int*) MALLOC((A->n + 1) * sizeof(int)); L->first_child = (int*) MALLOC((A->n + 1) * sizeof(int)); L->next_child = (int*) MALLOC((A->n + 1) * sizeof(int)); column_to_sn_map = (int*)MALLOC((A->n + 1) * sizeof(int)); map = (int*) MALLOC((A->n + 1) * sizeof(int)); first_child = (int*) MALLOC(((A->n) + 1) * sizeof(int)); next_child = (int*) MALLOC(((A->n) + 1) * sizeof(int)); rowind = (int*) MALLOC((A->n) * sizeof(int)); /* compute the vertex elimination tree */ parent = (int*)MALLOC((A->n + 1) * sizeof(int)); taucs_ccs_etree(A, parent, NULL, NULL, NULL); if (0) { int *cc1, *cc2, *rc1, *rc2; int *p1; int nnz1, nnz2; cc1 = (int*)MALLOC((A->n) * sizeof(int)); cc2 = (int*)MALLOC((A->n) * sizeof(int)); rc1 = (int*)MALLOC((A->n) * sizeof(int)); rc2 = (int*)MALLOC((A->n) * sizeof(int)); p1 = (int*)MALLOC((A->n) * sizeof(int)); taucs_ccs_etree_liu(A, parent, cc1, rc1, &nnz1); taucs_ccs_etree(A, p1, cc2, rc2, &nnz2); for (j = 0; j < (A->n); j++) { assert(parent[j] == p1[j]); } for (j = 0; j < (A->n); j++) { if (cc1[j] != cc2[j]) { sciprint("j=%d cc1=%d cc2=%d\n", j, cc1[j], cc2[j]); } assert(cc1[j] == cc2[j]); } for (j = 0; j < (A->n); j++) { if (rc1[j] != rc2[j]) { sciprint("j=%d rc1=%d rc2=%d\n", j, rc1[j], rc2[j]); } assert(rc1[j] == rc2[j]); } if (nnz1 != nnz2) { sciprint("nnz1=%d nnz2=%d\n", nnz1, nnz2); } FREE(cc1); FREE(cc2); FREE(rc1); FREE(rc2); } for (j = 0; j <= (A->n); j++) { first_child[j] = -1; } for (j = (A->n) - 1; j >= 0; j--) { int p = parent[j]; next_child[j] = first_child[p]; first_child[p] = j; } FREE(parent); ipostorder = (int*)MALLOC((A->n + 1) * sizeof(int)); { int next = 0; /*int* postorder = (int*)MALLOC((A->n+1)*sizeof(int));*/ recursive_postorder(A->n, first_child, next_child, NULL, ipostorder, &next); /* printf("ipostorder "); for (j=0; j <= (A->n); j++) printf("%d ",ipostorder[j]); printf("\n"); printf(" postorder "); for (j=0; j <= (A->n); j++) printf("%d ",postorder[j]); printf("\n"); */ } L->n_sn = 0; for (j = 0; j < (A->n); j++) { map[j] = -1; } for (j = 0; j <= (A->n); j++) { (L->first_child)[j] = (L->next_child)[j] = -1; } recursive_symbolic_elimination(A->n, A, first_child, next_child, &(L->n_sn), L->sn_size, L->sn_up_size, L->sn_struct, L->first_child, L->next_child, rowind, column_to_sn_map, map, do_order, ipostorder ); { double nnz = 0.0; double flops = 0.0; int sn, i, colnnz; for (sn = 0; sn < (L->n_sn); sn++) { for (i = 0, colnnz = (L->sn_up_size)[sn]; i < (L->sn_size)[sn]; i++, colnnz--) { flops += ((double)(colnnz) - 1.0) * ((double)(colnnz) + 2.0) / 2.0; nnz += (double) (colnnz); } } /*sciprint("\t\tSymbolic Analysis of CC^T: %.2e nonzeros, %.2e flops\n", nnz, flops); */ } for (j = 0; j < (A->n); j++) { map[j] = -1; } (void) recursive_amalgamate_supernodes((L->n_sn) - 1, &(L->n_sn), L->sn_size, L->sn_up_size, L->sn_struct, L->first_child, L->next_child, rowind, column_to_sn_map, map, do_order, ipostorder ); { double nnz = 0.0; double flops = 0.0; int sn, i, colnnz; for (sn = 0; sn < (L->n_sn); sn++) { for (i = 0, colnnz = (L->sn_up_size)[sn]; i < (L->sn_size)[sn]; i++, colnnz--) { flops += ((double)(colnnz) - 1.0) * ((double)(colnnz) + 2.0) / 2.0; nnz += (double) (colnnz); } } /*sciprint("\t\tRelaxed Analysis of LL^T: %.2e nonzeros, %.2e flops\n", nnz, flops); */ } /* { int i; printf("c2sn: "); for (i=0; in; i++) printf("%d ",column_to_sn_map[i]); printf("\n"); } */ L->sn_blocks_ld = (int*)MALLOC((L->n_sn) * sizeof(int)); L->sn_blocks = (double**)CALLOC((L->n_sn), sizeof(double*)); /* so we can free before allocation */ L->up_blocks_ld = (int*)MALLOC((L->n_sn) * sizeof(int)); L->up_blocks = (double**)CALLOC((L->n_sn), sizeof(double*)); FREE(rowind); FREE(map); FREE(column_to_sn_map); FREE(next_child); FREE(first_child); FREE(ipostorder); return 0; } /*************************************************************/ /* factor routines */ /*************************************************************/ static supernodal_frontal_matrix* recursive_multifrontal_supernodal_factor_llt(int sn, /* this supernode */ int is_root, /* is v the root? */ int* bitmap, taucs_ccs_matrix* A, supernodal_factor_matrix* snL, int* fail) { supernodal_frontal_matrix* my_matrix = NULL; supernodal_frontal_matrix* child_matrix = NULL; int child; int* v; int sn_size; int* first_child = snL->first_child; int* next_child = snL->next_child; v = &( snL->sn_struct[sn][0] ); if (!is_root) { sn_size = snL->sn_size[sn]; } else { sn_size = -1; } for (child = first_child[sn]; child != -1; child = next_child[child]) { child_matrix = recursive_multifrontal_supernodal_factor_llt(child, FALSE, bitmap, A, snL, fail); if (*fail) { if (child_matrix) { supernodal_frontal_free(child_matrix); } return NULL; } if (!is_root) { if (!my_matrix) { my_matrix = supernodal_frontal_create(v, sn_size, snL->sn_up_size[sn], snL->sn_struct[sn]); if (!my_matrix) { *fail = TRUE; supernodal_frontal_free(child_matrix); return NULL; } } multifrontal_supernodal_front_extend_add(my_matrix, child_matrix, bitmap); supernodal_frontal_free(child_matrix); } } /* in case we have no children, we allocate now */ if (!is_root && !my_matrix) my_matrix = supernodal_frontal_create(v, sn_size, snL->sn_up_size[sn], snL->sn_struct[sn]); if (!is_root) { if (multifrontal_supernodal_front_factor(sn, v, sn_size, A, my_matrix, bitmap, snL)) { /* nonpositive pivot */ *fail = TRUE; supernodal_frontal_free(my_matrix); return NULL; } } return my_matrix; } void* taucs_ccs_factor_llt_mf(taucs_ccs_matrix* A) { supernodal_factor_matrix* L; int* map; int fail; L = multifrontal_supernodal_create(); taucs_ccs_symbolic_elimination(A, L, FALSE /* don't sort row indices */ ); map = (int*)MALLOC((A->n + 1) * sizeof(int)); fail = FALSE; recursive_multifrontal_supernodal_factor_llt((L->n_sn), TRUE, map, A, L, &fail); FREE(map); if (fail) { taucs_supernodal_factor_free(L); return NULL; } return (void*) L; } /*************************************************************/ /* supernodal solve routines */ /*************************************************************/ static void recursive_supernodal_solve_l(int sn, /* this supernode */ int is_root, /* is v the root? */ int* first_child, int* next_child, int** sn_struct, int* sn_sizes, int* sn_up_sizes, int* sn_blocks_ld, double* sn_blocks[], int* up_blocks_ld, double* up_blocks[], double x[], double b[], double t[]) { int child; int sn_size; /* number of rows/columns in the supernode */ int up_size; /* number of rows that this supernode updates */ int ione = 1; double done = 1.0; double dzero = 0.0; double* xdense; double* bdense; double flops; int i, j, ip, jp; for (child = first_child[sn]; child != -1; child = next_child[child]) { recursive_supernodal_solve_l(child, FALSE, first_child, next_child, sn_struct, sn_sizes, sn_up_sizes, sn_blocks_ld, sn_blocks, up_blocks_ld, up_blocks, x, b, t); } if (!is_root) { sn_size = sn_sizes[sn]; up_size = sn_up_sizes[sn] - sn_sizes[sn]; flops = ((double)sn_size) * ((double)sn_size) + 2.0 * ((double)sn_size) * ((double)up_size); if (flops > BLAS_FLOPS_CUTOFF) { xdense = t; bdense = t + sn_size; for (i = 0; i < sn_size; i++) { xdense[i] = b[ sn_struct[ sn ][ i ] ]; } for (i = 0; i < up_size; i++) { bdense[i] = 0.0; } F2C(dtrsm)("Left", "Lower", "No Transpose", "No unit diagonal", &sn_size, &ione, &done, sn_blocks[sn], &(sn_blocks_ld[sn]), xdense , &sn_size); if ((up_size > 0) & (sn_size > 0)) { F2C(dgemm)("No Transpose", "No Transpose", &up_size, &ione, &sn_size, &done, up_blocks[sn], &(up_blocks_ld[sn]), xdense , &sn_size, &dzero, bdense , &up_size); } for (i = 0; i < sn_size; i++) { x[ sn_struct[ sn][ i ] ] = xdense[i]; } for (i = 0; i < up_size; i++) { b[ sn_struct[ sn ][ sn_size + i ] ] -= bdense[i]; } } else if (sn_size > SOLVE_DENSE_CUTOFF) { xdense = t; bdense = t + sn_size; for (i = 0; i < sn_size; i++) { xdense[i] = b[ sn_struct[ sn ][ i ] ]; } for (i = 0; i < up_size; i++) { bdense[i] = 0.0; } for (jp = 0; jp < sn_size; jp++) { xdense[jp] = xdense[jp] / sn_blocks[sn][ sn_blocks_ld[sn] * jp + jp]; for (ip = jp + 1; ip < sn_size; ip++) { xdense[ip] -= xdense[jp] * sn_blocks[sn][ sn_blocks_ld[sn] * jp + ip]; } } for (jp = 0; jp < sn_size; jp++) { for (ip = 0; ip < up_size; ip++) { bdense[ip] += xdense[jp] * up_blocks[sn][ up_blocks_ld[sn] * jp + ip]; } } for (i = 0; i < sn_size; i++) { x[ sn_struct[ sn][ i ] ] = xdense[i]; } for (i = 0; i < up_size; i++) { b[ sn_struct[ sn ][ sn_size + i ] ] -= bdense[i]; } } else { for (jp = 0; jp < sn_size; jp++) { j = sn_struct[sn][jp]; x[j] = b[j] / sn_blocks[sn][ sn_blocks_ld[sn] * jp + jp]; for (ip = jp + 1; ip < sn_size; ip++) { i = sn_struct[sn][ip]; b[i] -= x[j] * sn_blocks[sn][ sn_blocks_ld[sn] * jp + ip]; } for (ip = 0; ip < up_size; ip++) { i = sn_struct[sn][sn_size + ip]; b[i] -= x[j] * up_blocks[sn][ up_blocks_ld[sn] * jp + ip]; } } } } } static void recursive_supernodal_solve_lt(int sn, /* this supernode */ int is_root, /* is v the root? */ int* first_child, int* next_child, int** sn_struct, int* sn_sizes, int* sn_up_sizes, int* sn_blocks_ld, double* sn_blocks[], int* up_blocks_ld, double* up_blocks[], double x[], double b[], double t[]) { int child; int sn_size; /* number of rows/columns in the supernode */ int up_size; /* number of rows that this supernode updates */ int ione = 1; double done = 1.0; double dminusone = -1.0; double* xdense; double* bdense; double flops; int i, j, ip, jp; if (!is_root) { sn_size = sn_sizes[sn]; up_size = sn_up_sizes[sn] - sn_sizes[sn]; flops = ((double)sn_size) * ((double)sn_size) + 2.0 * ((double)sn_size) * ((double)up_size); if (flops > BLAS_FLOPS_CUTOFF) { bdense = t; xdense = t + sn_size; for (i = 0; i < sn_size; i++) { bdense[i] = b[ sn_struct[ sn][ i ] ]; } for (i = 0; i < up_size; i++) { xdense[i] = x[ sn_struct[sn][sn_size + i] ]; } if ((up_size > 0) & (sn_size > 0)) F2C(dgemm)("Transpose", "No Transpose", &sn_size, &ione, &up_size, &dminusone, up_blocks[sn], &(up_blocks_ld[sn]), xdense , &up_size, &done, bdense , &sn_size); F2C(dtrsm)("Left", "Lower", "Transpose", "No unit diagonal", &sn_size, &ione, &done, sn_blocks[sn], &(sn_blocks_ld[sn]), bdense , &sn_size); for (i = 0; i < sn_size; i++) { x[ sn_struct[ sn][ i ] ] = bdense[i]; } } else if (sn_size > SOLVE_DENSE_CUTOFF) { bdense = t; xdense = t + sn_size; for (i = 0; i < sn_size; i++) { bdense[i] = b[ sn_struct[ sn][ i ] ]; } for (i = 0; i < up_size; i++) { xdense[i] = x[ sn_struct[sn][sn_size + i] ]; } for (ip = sn_size - 1; ip >= 0; ip--) { for (jp = 0; jp < up_size; jp++) { bdense[ip] -= xdense[jp] * up_blocks[sn][ up_blocks_ld[sn] * ip + jp]; } } for (ip = sn_size - 1; ip >= 0; ip--) { for (jp = sn_size - 1; jp > ip; jp--) { bdense[ip] -= bdense[jp] * sn_blocks[sn][ sn_blocks_ld[sn] * ip + jp]; } bdense[ip] = bdense[ip] / sn_blocks[sn][ sn_blocks_ld[sn] * ip + ip]; } for (i = 0; i < sn_size; i++) { x[ sn_struct[ sn][ i ] ] = bdense[i]; } } else { for (ip = sn_size - 1; ip >= 0; ip--) { i = sn_struct[sn][ip]; for (jp = 0; jp < up_size; jp++) { j = sn_struct[sn][sn_size + jp]; b[i] -= x[j] * up_blocks[sn][ up_blocks_ld[sn] * ip + jp]; } for (jp = sn_size - 1; jp > ip; jp--) { j = sn_struct[sn][jp]; b[i] -= x[j] * sn_blocks[sn][ sn_blocks_ld[sn] * ip + jp]; } x[i] = b[i] / sn_blocks[sn][ sn_blocks_ld[sn] * ip + ip]; } } } for (child = first_child[sn]; child != -1; child = next_child[child]) { recursive_supernodal_solve_lt(child, FALSE, first_child, next_child, sn_struct, sn_sizes, sn_up_sizes, sn_blocks_ld, sn_blocks, up_blocks_ld, up_blocks, x, b, t); } } int taucs_supernodal_solve_llt(void* vL, double* x, double* b) { supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL; double* y; double* t; /* temporary vector */ int i; y = MALLOC((L->n) * sizeof(double)); t = MALLOC((L->n) * sizeof(double)); if (!y || !t) { FREE(y); FREE(t); sciprint(_("%s: No more memory.\n"), "multifrontal_supernodal_solve_llt"); return -1; } for (i = 0; i < L->n; i++) { x[i] = b[i]; } recursive_supernodal_solve_l (L->n_sn, TRUE, /* this is the root */ L->first_child, L->next_child, L->sn_struct, L->sn_size, L->sn_up_size, L->sn_blocks_ld, L->sn_blocks, L->up_blocks_ld, L->up_blocks, y, x, t); recursive_supernodal_solve_lt(L->n_sn, TRUE, /* this is the root */ L->first_child, L->next_child, L->sn_struct, L->sn_size, L->sn_up_size, L->sn_blocks_ld, L->sn_blocks, L->up_blocks_ld, L->up_blocks, x, y, t); FREE(y); FREE(t); return 0; } /***************************************************** * some utility routines * *****************************************************/ taucs_ccs_matrix* taucs_supernodal_factor_to_ccs(void* vL) { supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL; taucs_ccs_matrix* C; int n, nnz; int i, j, ip, jp, sn, next; double v; int* len; n = L->n; len = (int*) MALLOC(n * sizeof(int)); if (!len) { return NULL; } nnz = 0; for (sn = 0; sn < L->n_sn; sn++) { for (jp = 0; jp < (L->sn_size)[sn]; jp++) { j = (L->sn_struct)[sn][jp]; len[j] = 0; for (ip = jp; ip < (L->sn_size)[sn]; ip++) { v = (L->sn_blocks)[sn][ jp * (L->sn_blocks_ld)[sn] + ip ]; if (v) { len[j] ++; nnz ++; } } for (ip = (L->sn_size)[sn]; ip < (L->sn_up_size)[sn]; ip++) { v = (L->up_blocks)[sn][ jp * (L->up_blocks_ld)[sn] + (ip - (L->sn_size)[sn]) ]; if (v) { len[j] ++; nnz ++; } } } } C = taucs_ccs_create(n, n, nnz); if (!C) { FREE(len); return NULL; } C->flags = TAUCS_TRIANGULAR | TAUCS_LOWER; (C->colptr)[0] = 0; for (j = 1; j <= n; j++) { (C->colptr)[j] = (C->colptr)[j - 1] + len[j - 1]; } FREE(len); for (sn = 0; sn < L->n_sn; sn++) { for (jp = 0; jp < (L->sn_size)[sn]; jp++) { j = (L->sn_struct)[sn][jp]; next = (C->colptr)[j]; for (ip = jp; ip < (L->sn_size)[sn]; ip++) { i = (L->sn_struct)[sn][ ip ]; v = (L->sn_blocks)[sn][ jp * (L->sn_blocks_ld)[sn] + ip ]; if (v == 0.0) { continue; } (C->rowind)[next] = i; (C->values)[next] = v; next++; } for (ip = (L->sn_size)[sn]; ip < (L->sn_up_size)[sn]; ip++) { i = (L->sn_struct)[sn][ ip ]; v = (L->up_blocks)[sn][ jp * (L->up_blocks_ld)[sn] + (ip - (L->sn_size)[sn]) ]; if (v == 0.0) { continue; } (C->rowind)[next] = i; (C->values)[next] = v; next++; } } } return C; } /* * from the preeeding routine here is a small one to get only * the number of non zero elements of the factor (Bruno le 29/11/2001) * In fact the supernodal struct keeps on some exact zeros and so the * nnz get here is representative of the memory used by the supernodal * struc. */ int taucs_get_nnz_from_supernodal_factor(void* vL) { supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL; int nnz = 0; int jp = 0, sn = 0; nnz = 0; for (sn = 0; sn < L->n_sn; sn++) for (jp = 0; jp < (L->sn_size)[sn]; jp++) { nnz += (L->sn_up_size)[sn] - jp; } return ( nnz ); }