virtual void chain() {
  using Eigen::MatrixXd;

  MatrixXd Lbar(M_, M_);
  MatrixXd L(M_, M_);
  Lbar.setZero();
  L.setZero();

  // Gather the values and adjoints of the packed lower-triangular Cholesky
  // factor from the autodiff stack.
  size_t pos = 0;
  for (size_type j = 0; j < M_; ++j) {
    for (size_type i = j; i < M_; ++i) {
      Lbar.coeffRef(i, j) = variRefL_[pos]->adj_;
      L.coeffRef(i, j) = variRefL_[pos]->val_;
      ++pos;
    }
  }

  // Mirrors the lower triangle of a padded row-major matrix onto its upper
  // triangle, making the matrix symmetric.
  static const char* custom_kernel_lower_upper =
      "__kernel void copy_lower_tri_upper_tri(\n"
      "          __global double * vec1,\n"
      "          unsigned int M,\n"
      "          unsigned int N,\n"
      "          unsigned int Npad)\n"
      "{\n"
      "    int i = get_global_id(0);\n"
      "    int j = get_global_id(1);\n"
      "    if (i < M && j < N && i < j) {\n"
      // The tail of this kernel is truncated in the original listing; the
      // copy statement below is reconstructed from the kernel's name.
      "        vec1[i*Npad+j] = vec1[j*Npad+i];\n"
      "    }\n"
      "}\n";

  // Scales the diagonal of the result (the outer Phi of the blocked
  // pullback). Its source is truncated in the original listing; a sketch of
  // a plausible reconstruction follows this function.
  static const char* custom_kernel_diagonal =
      /* ... kernel source truncated in the original listing ... */;

  // Blocked triangular inversion, step 1: each work-item inverts one
  // diagonal block of the matrix by substitution into the scratch buffer vv.
  // The kernel signature and the opening of the body are truncated in the
  // original listing; the surviving fragment is kept below.
  static const char* custom_kernel_inverse_step1 =
      /* ... signature and opening truncated in the original listing;
             the surviving body fragment resumes inside a loop over ii ... */
      "      if (ii > 0) {\n"
      "        for (unsigned int j = ii; j < part_size; j++) {\n"
      "          faktor = ap[(i+ii-1)*Mpad+(j+i)];\n"
      "          for (unsigned int k = 0; k < part_size; k++) {\n"
      "            vv[offset+j*(part_size_fixed+1)+k] =\n"
      "                vv[offset+j*(part_size_fixed+1)+k]\n"
      "                  - faktor*vv[offset+(ii-1)*(part_size_fixed+1)+k];\n"
      "          }\n"
      "        }\n"
      "      }\n"
      "      faktor = ap[(ii+i)*Mpad+ii+i];\n"
      "      for (unsigned int k = 0; k < part_size; k++) {\n"
      "        vv[offset+ii*(part_size_fixed+1)+k] =\n"
      "            vv[offset+ii*(part_size_fixed+1)+k]/faktor;\n"
      "      }\n"
      "    }\n"
      "\n"
      "    for (int p = 0; p"
      /* ... the write-back loop is truncated in the original listing ... */;

  // Blocked inversion, steps 2 and 3: stitch the inverted diagonal blocks
  // into progressively larger inverted triangles (the solved block size
  // doubles on each pass of the loop below). The kernel source is entirely
  // truncated in the original listing.
  static const char* custom_kernel_inverse_step2_3 =
      /* ... kernel source truncated in the original listing ... */;

  viennacl::matrix<double> vcl_L(M_, M_);
  viennacl::matrix<double> vcl_temp(M_, M_);
  viennacl::matrix<double> vcl_Lbar(M_, M_);
  viennacl::copy(L, vcl_L);
  viennacl::copy(Lbar, vcl_Lbar);

  // Lbar <- L^T * Lbar on the device; vcl_L holds L^T from here on.
  vcl_L = viennacl::trans(vcl_L);
  vcl_Lbar = viennacl::linalg::prod(vcl_L, vcl_Lbar);

  viennacl::ocl::program& my_prog =
      viennacl::ocl::current_context().add_program(
          custom_kernel_lower_upper, "custom_kernel_lower_upper");
  viennacl::ocl::kernel& my_kernel_mul =
      my_prog.get_kernel("copy_lower_tri_upper_tri");
  viennacl::ocl::program& my_prog_diag =
      viennacl::ocl::current_context().add_program(
          custom_kernel_diagonal, "custom_kernel_diagonal");
  viennacl::ocl::kernel& my_kernel_diag =
      my_prog_diag.get_kernel("diagonal_mul");
  viennacl::ocl::program& my_prog_inv1 =
      viennacl::ocl::current_context().add_program(
          custom_kernel_inverse_step1, "custom_kernel_inverse_step1");
  viennacl::ocl::kernel& my_kernel_inv1 =
      my_prog_inv1.get_kernel("inverse_step1");
  viennacl::ocl::program& my_prog_inv2_3 =
      viennacl::ocl::current_context().add_program(
          custom_kernel_inverse_step2_3, "custom_kernel_inverse_step2_3");
  viennacl::ocl::kernel& my_kernel_inv2 =
      my_prog_inv2_3.get_kernel("inverse_step2");
  viennacl::ocl::kernel& my_kernel_inv3 =
      my_prog_inv2_3.get_kernel("inverse_step3");

  // Symmetrize L^T * Lbar by mirroring its lower triangle onto the upper.
  my_kernel_mul.global_work_size(0, vcl_Lbar.internal_size1());
  my_kernel_mul.global_work_size(1, vcl_Lbar.internal_size2());
  my_kernel_mul.local_work_size(0, 32);
  my_kernel_mul.local_work_size(1, 32);
  viennacl::ocl::enqueue(my_kernel_mul(
      vcl_Lbar,
      static_cast<cl_uint>(vcl_Lbar.size1()),
      static_cast<cl_uint>(vcl_Lbar.size2()),
      static_cast<cl_uint>(vcl_Lbar.internal_size2())));

  // Partition the triangular matrix for the blocked inversion.
  int parts = 32;
  if (M_ > 2500)
    parts = 64;  // need to set up a smarter way of doing this
  int remainder = M_ % parts;
  int part_size_fixed = M_ / parts;
  viennacl::matrix<double> vcl_temp2(M_, M_ * 2);
  std::vector<int> stl_sizes(parts);
  viennacl::vector<int> vcl_sizes(parts);
  // Reconstructed from context (the loop body is truncated in the original
  // listing): the first `remainder` partitions get one extra row.
  for (int i = 0; i < parts; ++i)
    stl_sizes[i] = part_size_fixed + (i < remainder ? 1 : 0);
  viennacl::copy(stl_sizes, vcl_sizes);

  // Step 1: invert the diagonal blocks of vcl_L. The work-size setup and
  // the leading arguments of this enqueue are truncated in the original
  // listing; they are completed here after the pattern of the step 2/3
  // calls below.
  viennacl::ocl::enqueue(my_kernel_inv1(
      vcl_L, vcl_sizes, vcl_temp2,
      static_cast<cl_uint>(remainder),
      static_cast<cl_uint>(part_size_fixed),
      static_cast<cl_uint>(vcl_L.size2()),
      static_cast<cl_uint>(vcl_L.internal_size2())));

  // Steps 2 and 3: combine inverted blocks pairwise until vcl_L holds the
  // full triangular inverse.
  int repeat = 1;
  int sizePad;
  for (int pp = parts; pp > 1; pp /= 2) {
    sizePad = (((part_size_fixed + 1) * repeat + 31) / 32) * 32;
    my_kernel_inv2.global_work_size(0, sizePad);
    my_kernel_inv2.global_work_size(1, sizePad / 4);
    my_kernel_inv2.global_work_size(2, pp / 2);
    my_kernel_inv2.local_work_size(0, 32);
    my_kernel_inv2.local_work_size(1, 32 / 4);
    my_kernel_inv2.local_work_size(2, 1);
    my_kernel_inv3.global_work_size(0, sizePad);
    my_kernel_inv3.global_work_size(1, sizePad / 4);
    my_kernel_inv3.global_work_size(2, pp / 2);
    my_kernel_inv3.local_work_size(0, 32);
    my_kernel_inv3.local_work_size(1, 32 / 4);
    my_kernel_inv3.local_work_size(2, 1);
    viennacl::ocl::enqueue(my_kernel_inv2(
        vcl_L, vcl_sizes, vcl_temp2,
        static_cast<cl_uint>(repeat),
        static_cast<cl_uint>(remainder),
        static_cast<cl_uint>(part_size_fixed),
        static_cast<cl_uint>(vcl_L.size2()),
        static_cast<cl_uint>(vcl_L.internal_size2())));
    viennacl::ocl::enqueue(my_kernel_inv3(
        vcl_L, vcl_sizes, vcl_temp2,
        static_cast<cl_uint>(repeat),
        static_cast<cl_uint>(remainder),
        static_cast<cl_uint>(part_size_fixed),
        static_cast<cl_uint>(vcl_L.size2()),
        static_cast<cl_uint>(vcl_L.internal_size2())));
    repeat *= 2;
  }

  // Sandwich the symmetrized matrix: Lbar <- L^-T * S * L^-1. vcl_L now
  // holds (L^T)^-1, and S is symmetric, so transposing the intermediate
  // product gives the desired result.
  vcl_Lbar = viennacl::linalg::prod(vcl_L, vcl_Lbar);
  vcl_Lbar = viennacl::linalg::prod(vcl_L, viennacl::trans(vcl_Lbar));

  // Scale the diagonal of the result and copy it back to the host.
  my_kernel_diag.global_work_size(0, vcl_Lbar.internal_size1());
  my_kernel_diag.global_work_size(1, vcl_Lbar.internal_size2());
  my_kernel_diag.local_work_size(0, 32);
  my_kernel_diag.local_work_size(1, 32);
  viennacl::ocl::enqueue(my_kernel_diag(
      vcl_Lbar,
      static_cast<cl_uint>(vcl_Lbar.size1()),
      static_cast<cl_uint>(vcl_Lbar.size2()),
      static_cast<cl_uint>(vcl_Lbar.internal_size2())));
  viennacl::copy(vcl_Lbar, Lbar);

  // Scatter the lower triangle of the result into the adjoints of the
  // input matrix.
  pos = 0;
  for (size_type j = 0; j < M_; ++j)
    for (size_type i = j; i < M_; ++i)
      variRefA_[pos++]->adj_ += Lbar.coeffRef(i, j);
}
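Taken together, the device calls implement the standard reverse-mode rule for the Cholesky factorization (cf. Iain Murray, "Differentiation of the Cholesky decomposition", arXiv:1602.07527). Writing $\Phi(X)$ for the lower triangle of $X$ with its diagonal halved, and assuming `diagonal_mul` scales the diagonal by $1/2$ (its source is truncated above), the sequence computes

$$\bar{A} \;=\; \Phi\!\left( L^{-T} \left( \Phi(L^{T}\bar{L}) + \Phi(L^{T}\bar{L})^{T} \right) L^{-1} \right).$$

The first `prod` forms $C = L^{T}\bar{L}$, `copy_lower_tri_upper_tri` turns it into the symmetric $\Phi(C)+\Phi(C)^{T}$, the three inverse kernels replace `vcl_L` by $L^{-T}$, the two remaining `prod` calls sandwich the symmetrized matrix between $L^{-T}$ and $L^{-1}$, and `diagonal_mul` applies the outer $\Phi$ before the lower triangle is scattered into the adjoints of $A$.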
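Since the source of `custom_kernel_diagonal` is truncated in the listing, here is a minimal sketch of what it plausibly contains. The argument list (matrix, `size1`, `size2`, `internal_size2`) is pinned down by the enqueue call above; the body, which scales each diagonal entry by 0.5 as the blocked pullback identity requires, is an assumption and not the original source:

// Hypothetical reconstruction, not the original kernel: scale the diagonal
// entries of a padded row-major matrix by 0.5.
static const char* custom_kernel_diagonal =
    "__kernel void diagonal_mul(\n"
    "          __global double * vec1,\n"
    "          unsigned int M,\n"
    "          unsigned int N,\n"
    "          unsigned int Npad)\n"
    "{\n"
    "    int i = get_global_id(0);\n"
    "    int j = get_global_id(1);\n"
    "    if (i < M && j < N && i == j) {\n"
    "        vec1[i*Npad+j] *= 0.5;\n"
    "    }\n"
    "}\n";

The bounds guard mirrors the one in `copy_lower_tri_upper_tri`, so the same launch over the padded `internal_size1() x internal_size2()` range with 32x32 work-groups stays in bounds.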