@@ -3352,8 +3352,8 @@ integer(qmckl_exit_code) function qmckl_compute_jastrow_champ_delta_p_gl_doc( &
3352
3352
3353
3353
do nw=1, walk_num
3354
3354
do m=1, cord_num-1
3355
- do j = 1, elec_num
3356
- do k = 1, 4
3355
+ do k = 1, 4
3356
+ do j = 1, elec_num
3357
3357
delta_e_gl(j,k) = een_rescaled_single_e_gl(k,j,m,nw) - een_rescaled_e_gl(num, k, j, m, nw)
3358
3358
end do
3359
3359
end do
@@ -3371,7 +3371,7 @@ integer(qmckl_exit_code) function qmckl_compute_jastrow_champ_delta_p_gl_doc( &
3371
3371
cummu = 0.0d0
3372
3372
do i = 1, elec_num
3373
3373
3374
- delta_p_gl(i,a,k,l,m,nw) = -een_rescaled_e_gl(i,k,num,m,nw) * een_re_n&
3374
+ delta_p_gl(i,a,k,l,m,nw) = -een_rescaled_e_gl(i,k,num,m,nw) * een_re_n &
3375
3375
- een_rescaled_single_e_gl(k,i,m,nw) * een_re_single_n
3376
3376
3377
3377
cummu = cummu + delta_e_gl(i,k) * een_rescaled_n(i,a,l,nw)
@@ -3790,27 +3790,12 @@ integer(qmckl_exit_code) function qmckl_compute_jastrow_champ_factor_single_een_
3790
3790
do a = 1, nucl_num
3791
3791
cn = c_vector_full(a, n)
3792
3792
if(cn == 0.d0) cycle
3793
- !do i = 1, elec_num
3794
- ! delta_een_gl(i,kk,nw) = delta_een_gl(i,kk,nw) + ( &
3795
- ! delta_p_gl(i,a,kk,m ,k,nw) * een_rescaled_n(i,a,m+l,nw) + &
3796
- ! delta_p_gl(i,a,kk,m+l,k,nw) * een_rescaled_n(i,a,m ,nw) + &
3797
- ! delta_p(i,a,m ,k,nw) * een_rescaled_n_gl(i,kk,a,m+l,nw) + &
3798
- ! delta_p(i,a,m+l,k,nw) * een_rescaled_n_gl(i,kk,a,m ,nw) ) * cn
3799
- !end do
3800
3793
do i = 1, elec_num
3801
- ! Cache repeated accesses
3802
- dpg1_m = delta_p_gl(i,a,kk,m ,k,nw)
3803
- dpg1_ml = delta_p_gl(i,a,kk,m+l,k,nw)
3804
- dp_m = delta_p(i,a,m ,k,nw)
3805
- dp_ml = delta_p(i,a,m+l,k,nw)
3806
-
3807
- een_r_m = een_rescaled_n(i,a,m ,nw)
3808
- een_r_ml = een_rescaled_n(i,a,m+l,nw)
3809
- een_r_gl_m = een_rescaled_n_gl(i,kk,a,m ,nw)
3810
- een_r_gl_ml = een_rescaled_n_gl(i,kk,a,m+l,nw)
3811
-
3812
- delta_een_gl(i,kk,nw) = delta_een_gl(i,kk,nw) + cn * &
3813
- (dpg1_m * een_r_ml + dpg1_ml * een_r_m + dp_m * een_r_gl_ml + dp_ml * een_r_gl_m)
3794
+ delta_een_gl(i,kk,nw) = delta_een_gl(i,kk,nw) + ( &
3795
+ delta_p_gl(i,a,kk,m ,k,nw) * een_rescaled_n(i,a,m+l,nw) + &
3796
+ delta_p_gl(i,a,kk,m+l,k,nw) * een_rescaled_n(i,a,m ,nw) + &
3797
+ delta_p(i,a,m ,k,nw) * een_rescaled_n_gl(i,kk,a,m+l,nw) + &
3798
+ delta_p(i,a,m+l,k,nw) * een_rescaled_n_gl(i,kk,a,m ,nw) ) * cn
3814
3799
end do
3815
3800
3816
3801
delta_een_gl(num,kk,nw) = delta_een_gl(num,kk,nw) + ( &
@@ -3856,17 +3841,17 @@ qmckl_compute_jastrow_champ_factor_single_een_gl_hpc (const qmckl_context contex
3856
3841
const int64_t nucl_num,
3857
3842
const int64_t cord_num,
3858
3843
const int64_t dim_c_vector,
3859
- const double* c_vector_full,
3860
- const int64_t* lkpm_combined_index,
3861
- const double* tmp_c,
3862
- const double* dtmp_c,
3863
- const double* delta_p,
3864
- const double* delta_p_gl,
3865
- const double* een_rescaled_n,
3866
- const double* een_rescaled_single_n,
3867
- const double* een_rescaled_n_gl,
3868
- const double* een_rescaled_single_n_gl,
3869
- double* const delta_een_gl )
3844
+ const double* restrict c_vector_full,
3845
+ const int64_t* restrict lkpm_combined_index,
3846
+ const double* restrict tmp_c,
3847
+ const double* restrict dtmp_c,
3848
+ const double* restrict delta_p,
3849
+ const double* restrict delta_p_gl,
3850
+ const double* restrict een_rescaled_n,
3851
+ const double* restrict een_rescaled_single_n,
3852
+ const double* restrict een_rescaled_n_gl,
3853
+ const double* restrict een_rescaled_single_n_gl,
3854
+ double* restrict const delta_een_gl )
3870
3855
{
3871
3856
3872
3857
@@ -3884,7 +3869,9 @@ qmckl_compute_jastrow_champ_factor_single_een_gl_hpc (const qmckl_context contex
3884
3869
return QMCKL_SUCCESS;
3885
3870
}
3886
3871
3872
+ #ifdef HAVE_OPENMP
3887
3873
#pragma omp parallel for
3874
+ #endif
3888
3875
for (int64_t nw=0 ; nw<walk_num ; nw++) {
3889
3876
for (size_t i=0 ; i<4*elec_num ; ++i) {
3890
3877
delta_een_gl[i+nw*4*elec_num] = 0.;
@@ -3905,21 +3892,24 @@ qmckl_compute_jastrow_champ_factor_single_een_gl_hpc (const qmckl_context contex
3905
3892
const int64_t m = lkpm_combined_index[n+3*dim_c_vector];
3906
3893
3907
3894
for (int64_t kk=0 ; kk<4 ; ++kk) {
3908
- double* dgl = &delta_een_gl[elec_num*(kk+4*nw)];
3895
+ double* restrict dgl = &delta_een_gl[elec_num*(kk+4*nw)];
3909
3896
3910
3897
for (int64_t a=0 ; a<nucl_num ; ++a) {
3911
3898
const double cn = c_vector_full[a+n*nucl_num];
3912
3899
if (cn == 0.) continue;
3913
3900
3914
- const double* dpg1_m = &delta_p_gl[elec_num*(a+nucl_num*(kk+4*(m+(cord_num+1)*(k+cord_num*nw))))];
3915
- const double* dpg1_ml = &delta_p_gl[elec_num*(a+nucl_num*(kk+4*(m+l+(cord_num+1)*(k+cord_num*nw))))];
3916
- const double* dp_m = &delta_p[elec_num*(a+nucl_num*(m+(cord_num+1)*(k+cord_num*nw)))];
3917
- const double* dp_ml = &delta_p[elec_num*(a+nucl_num*(m+l+(cord_num+1)*(k+cord_num*nw)))];
3918
- const double* een_r_m = &een_rescaled_n[elec_num*(a+nucl_num*(m+(cord_num+1)*nw))];
3919
- const double* een_r_ml = &een_rescaled_n[elec_num*(a+nucl_num*(m+l+(cord_num+1)*nw))];
3920
- const double* een_r_gl_m = &een_rescaled_n_gl[elec_num*(kk+4*(a+nucl_num*(m+(cord_num+1)*nw)))];
3921
- const double* een_r_gl_ml = &een_rescaled_n_gl[elec_num*(kk+4*(a+nucl_num*(m+l+(cord_num+1)*nw)))];
3922
-
3901
+ const double* restrict dpg1_m = &delta_p_gl[elec_num*(a+nucl_num*(kk+4*(m+(cord_num+1)*(k+cord_num*nw))))];
3902
+ const double* restrict dpg1_ml = &delta_p_gl[elec_num*(a+nucl_num*(kk+4*(m+l+(cord_num+1)*(k+cord_num*nw))))];
3903
+ const double* restrict dp_m = &delta_p[elec_num*(a+nucl_num*(m+(cord_num+1)*(k+cord_num*nw)))];
3904
+ const double* restrict dp_ml = &delta_p[elec_num*(a+nucl_num*(m+l+(cord_num+1)*(k+cord_num*nw)))];
3905
+ const double* restrict een_r_m = &een_rescaled_n[elec_num*(a+nucl_num*(m+(cord_num+1)*nw))];
3906
+ const double* restrict een_r_ml = &een_rescaled_n[elec_num*(a+nucl_num*(m+l+(cord_num+1)*nw))];
3907
+ const double* restrict een_r_gl_m = &een_rescaled_n_gl[elec_num*(kk+4*(a+nucl_num*(m+(cord_num+1)*nw)))];
3908
+ const double* restrict een_r_gl_ml = &een_rescaled_n_gl[elec_num*(kk+4*(a+nucl_num*(m+l+(cord_num+1)*nw)))];
3909
+
3910
+ #ifdef HAVE_OPENMP
3911
+ #pragma omp simd
3912
+ #endif
3923
3913
for (int64_t i=0 ; i<elec_num ; ++i) {
3924
3914
dgl[i] += cn * (dpg1_m[i] * een_r_ml[i] + dpg1_ml[i] * een_r_m[i] +
3925
3915
dp_m[i] * een_r_gl_ml[i] + dp_ml[i] * een_r_gl_m[i]);
@@ -3944,21 +3934,24 @@ qmckl_compute_jastrow_champ_factor_single_een_gl_hpc (const qmckl_context contex
3944
3934
const double cn = 2. * c_vector_full[a+n*nucl_num];
3945
3935
if (cn == 0.) continue;
3946
3936
3947
- double* dgl4 = &delta_een_gl[elec_num*(3+4*nw)];
3937
+ double* restrict dgl4 = &delta_een_gl[elec_num*(3+4*nw)];
3948
3938
3949
- const double* dpg1_m = &delta_p_gl[elec_num*(a+nucl_num*(0+4*(m+(cord_num+1)*(k+cord_num*nw))))];
3950
- const double* dpg2_m = &delta_p_gl[elec_num*(a+nucl_num*(1+4*(m+(cord_num+1)*(k+cord_num*nw))))];
3951
- const double* dpg3_m = &delta_p_gl[elec_num*(a+nucl_num*(2+4*(m+(cord_num+1)*(k+cord_num*nw))))];
3952
- const double* dpg1_ml = &delta_p_gl[elec_num*(a+nucl_num*(0+4*(m+l+(cord_num+1)*(k+cord_num*nw))))];
3953
- const double* dpg2_ml = &delta_p_gl[elec_num*(a+nucl_num*(1+4*(m+l+(cord_num+1)*(k+cord_num*nw))))];
3954
- const double* dpg3_ml = &delta_p_gl[elec_num*(a+nucl_num*(2+4*(m+l+(cord_num+1)*(k+cord_num*nw))))];
3955
- const double* een_r_gl1_m = &een_rescaled_n_gl[elec_num*(0+4*(a+nucl_num*(m+(cord_num+1)*nw)))];
3956
- const double* een_r_gl2_m = &een_rescaled_n_gl[elec_num*(1+4*(a+nucl_num*(m+(cord_num+1)*nw)))];
3957
- const double* een_r_gl3_m = &een_rescaled_n_gl[elec_num*(2+4*(a+nucl_num*(m+(cord_num+1)*nw)))];
3958
- const double* een_r_gl1_ml = &een_rescaled_n_gl[elec_num*(0+4*(a+nucl_num*(m+l+(cord_num+1)*nw)))];
3959
- const double* een_r_gl2_ml = &een_rescaled_n_gl[elec_num*(1+4*(a+nucl_num*(m+l+(cord_num+1)*nw)))];
3960
- const double* een_r_gl3_ml = &een_rescaled_n_gl[elec_num*(2+4*(a+nucl_num*(m+l+(cord_num+1)*nw)))];
3961
-
3939
+ const double* restrict dpg1_m = &delta_p_gl[elec_num*(a+nucl_num*(0+4*(m+(cord_num+1)*(k+cord_num*nw))))];
3940
+ const double* restrict dpg2_m = &delta_p_gl[elec_num*(a+nucl_num*(1+4*(m+(cord_num+1)*(k+cord_num*nw))))];
3941
+ const double* restrict dpg3_m = &delta_p_gl[elec_num*(a+nucl_num*(2+4*(m+(cord_num+1)*(k+cord_num*nw))))];
3942
+ const double* restrict dpg1_ml = &delta_p_gl[elec_num*(a+nucl_num*(0+4*(m+l+(cord_num+1)*(k+cord_num*nw))))];
3943
+ const double* restrict dpg2_ml = &delta_p_gl[elec_num*(a+nucl_num*(1+4*(m+l+(cord_num+1)*(k+cord_num*nw))))];
3944
+ const double* restrict dpg3_ml = &delta_p_gl[elec_num*(a+nucl_num*(2+4*(m+l+(cord_num+1)*(k+cord_num*nw))))];
3945
+ const double* restrict een_r_gl1_m = &een_rescaled_n_gl[elec_num*(0+4*(a+nucl_num*(m+(cord_num+1)*nw)))];
3946
+ const double* restrict een_r_gl2_m = &een_rescaled_n_gl[elec_num*(1+4*(a+nucl_num*(m+(cord_num+1)*nw)))];
3947
+ const double* restrict een_r_gl3_m = &een_rescaled_n_gl[elec_num*(2+4*(a+nucl_num*(m+(cord_num+1)*nw)))];
3948
+ const double* restrict een_r_gl1_ml = &een_rescaled_n_gl[elec_num*(0+4*(a+nucl_num*(m+l+(cord_num+1)*nw)))];
3949
+ const double* restrict een_r_gl2_ml = &een_rescaled_n_gl[elec_num*(1+4*(a+nucl_num*(m+l+(cord_num+1)*nw)))];
3950
+ const double* restrict een_r_gl3_ml = &een_rescaled_n_gl[elec_num*(2+4*(a+nucl_num*(m+l+(cord_num+1)*nw)))];
3951
+
3952
+ #ifdef HAVE_OPENMP
3953
+ #pragma omp simd
3954
+ #endif
3962
3955
for (int64_t i=0 ; i<elec_num ; ++i) {
3963
3956
dgl4[i] += (dpg1_m[i] * een_r_gl1_ml[i] + dpg1_ml[i] * een_r_gl1_m[i] +
3964
3957
dpg2_m[i] * een_r_gl2_ml[i] + dpg2_ml[i] * een_r_gl2_m[i] +
0 commit comments