/* NFFT 3.5.3alpha — fpt.c: fast polynomial transform (FPT/DPT). */
/*
 * Copyright (c) 2002, 2017 Jens Keiner, Stefan Kunis, Daniel Potts
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
18
#include "config.h"

#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <math.h>
#ifdef HAVE_COMPLEX_H
#include <complex.h>
#endif

#include "nfft3.h"
#include "infft.h"

43#undef FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE
44
45/* Macros for index calculation. */
46
48#define K_START_TILDE(x,y) (MAX(MIN(x,y-2),0))
49
51#define K_END_TILDE(x,y) MIN(x,y-1)
52
54#define FIRST_L(x,y) (LRINT(floor((x)/(double)y)))
55
57#define LAST_L(x,y) (LRINT(ceil(((x)+1)/(double)y))-1)
58
59#define N_TILDE(y) (y-1)
60
61#define IS_SYMMETRIC(x,y,z) (x >= ((y-1.0)/z))
62
63#define FPT_BREAK_EVEN 4
64
65#include "fpt.h"
66
67
/** Computes u[l] = a * (b * v[l] * x[l] + w[l] * y[l]) for l = 0,...,n-1.
 *  x, y, u are complex vectors; v, w are real coefficient vectors. */
static inline void abuvxpwy(double a, double b, double _Complex* u, double _Complex* x,
    double* v, double _Complex* y, double* w, int n)
{
  int l;
  for (l = 0; l < n; l++)
    u[l] = a * (b * v[l] * x[l] + w[l] * y[l]);
}
76
/** Generator for symmetric variants of abuvxpwy: v and w hold only the first
 *  n/2 coefficients. The first half of u is computed as in abuvxpwy; the
 *  second half reuses v and w mirrored (traversed backwards), with sign S1
 *  applied to the v-term and S2 to the w-term.
 *  NOTE(review): assumes n is even — odd n would skip the middle element. */
#define ABUVXPWY_SYMMETRIC(NAME,S1,S2) \
static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
    double* v, double _Complex* y, double* w, int n) \
{ \
  const int n2 = n>>1; \
  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
  double *v_ptr = v, *w_ptr = w; \
  for (l = 0; l < n2; l++) \
    *u_ptr++ = a * (b * (*v_ptr++) * (*x_ptr++) + (*w_ptr++) * (*y_ptr++)); \
  v_ptr--; w_ptr--; \
  for (l = 0; l < n2; l++) \
    *u_ptr++ = a * (b * S1 * (*v_ptr--) * (*x_ptr++) + S2 * (*w_ptr--) * (*y_ptr++)); \
}

/* Mirrored half: v-term kept, w-term negated. */
ABUVXPWY_SYMMETRIC(abuvxpwy_symmetric1,1.0,-1.0)
/* Mirrored half: v-term negated, w-term kept. */
ABUVXPWY_SYMMETRIC(abuvxpwy_symmetric2,-1.0,1.0)
93
/** Generator for symmetric abuvxpwy variants where the w coefficients are
 *  reconstructed on the fly as v[l]*(1+xx[l]) (xx advances over the full
 *  length n while v is mirrored after n/2). S1 is the sign applied to both
 *  terms in the mirrored second half.
 *  NOTE(review): xx presumably holds the Chebyshev nodes of the current
 *  cascade level — confirm against fpt_do_step_t_symmetric_u. */
#define ABUVXPWY_SYMMETRIC_1(NAME,S1) \
static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
    double* v, double _Complex* y, int n, double *xx) \
{ \
  const int n2 = n>>1; \
  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
  double *v_ptr = v, *xx_ptr = xx; \
  for (l = 0; l < n2; l++, v_ptr++) \
    *u_ptr++ = a * (b * (*v_ptr) * (*x_ptr++) + ((*v_ptr)*(1.0+*xx_ptr++)) * (*y_ptr++)); \
  v_ptr--; \
  for (l = 0; l < n2; l++, v_ptr--) \
    *u_ptr++ = a * (b * S1 * (*v_ptr) * (*x_ptr++) + (S1 * (*v_ptr) * (1.0+*xx_ptr++)) * (*y_ptr++)); \
}

ABUVXPWY_SYMMETRIC_1(abuvxpwy_symmetric1_1,1.0)
ABUVXPWY_SYMMETRIC_1(abuvxpwy_symmetric1_2,-1.0)
110
/** Generator for symmetric abuvxpwy variants where the v coefficients are
 *  reconstructed as w[l]/(1+xx[l]) (xx advances over the full length n while
 *  w is mirrored after n/2). S1 is the sign applied to both terms in the
 *  mirrored second half. Counterpart of ABUVXPWY_SYMMETRIC_1.
 *  NOTE(review): requires 1+xx[l] != 0, i.e. xx[l] != -1 — presumably
 *  guaranteed by the Chebyshev nodes used; confirm at call sites. */
#define ABUVXPWY_SYMMETRIC_2(NAME,S1) \
static inline void NAME(double a, double b, double _Complex* u, double _Complex* x, \
    double _Complex* y, double* w, int n, double *xx) \
{ \
  const int n2 = n>>1; \
  int l; double _Complex *u_ptr = u, *x_ptr = x, *y_ptr = y; \
  double *w_ptr = w, *xx_ptr = xx; \
  for (l = 0; l < n2; l++, w_ptr++) \
    *u_ptr++ = a * (b * (*w_ptr/(1.0+*xx_ptr++)) * (*x_ptr++) + (*w_ptr) * (*y_ptr++)); \
  w_ptr--; \
  for (l = 0; l < n2; l++, w_ptr--) \
    *u_ptr++ = a * (b * (S1 * (*w_ptr)/(1.0+*xx_ptr++) ) * (*x_ptr++) + S1 * (*w_ptr) * (*y_ptr++)); \
}

ABUVXPWY_SYMMETRIC_2(abuvxpwy_symmetric2_1,1.0)
ABUVXPWY_SYMMETRIC_2(abuvxpwy_symmetric2_2,-1.0)
127
/** Computes u[l] = a * (v[l]*x[l] + w[l]*y[l]) for l = 0,...,n-1. */
static inline void auvxpwy(double a, double _Complex* u, double _Complex* x, double* v,
    double _Complex* y, double* w, int n)
{
  int l;
  for (l = 0; l < n; l++)
    u[l] = a * (v[l] * x[l] + w[l] * y[l]);
}
137
/** Symmetric variant of auvxpwy: v and w hold only the first n/2
 *  coefficients. The second half of u reuses them mirrored, with the
 *  w-term subtracted. Assumes n is even. */
static inline void auvxpwy_symmetric(double a, double _Complex* u, double _Complex* x,
    double* v, double _Complex* y, double* w, int n)
{
  const int half = n>>1;
  int l;
  for (l = 0; l < half; l++)
    u[l] = a * (v[l] * x[l] + w[l] * y[l]);
  for (l = 0; l < half; l++)
    u[half + l] = a * (v[half-1-l] * x[half+l] - w[half-1-l] * y[half+l]);
}
151
/** Like auvxpwy_symmetric, but both coefficient vectors are scaled on the
 *  fly by (1 + xx[l]); xx runs over the full length n while v and w (each of
 *  length n/2) are mirrored in the second half, where the v-term is negated.
 *  Assumes n is even. */
static inline void auvxpwy_symmetric_1(double a, double _Complex* u, double _Complex* x,
    double* v, double _Complex* y, double* w, int n, double *xx)
{
  const int half = n>>1;
  int l;
  for (l = 0; l < half; l++)
  {
    const double s = 1.0 + xx[l];
    u[l] = a * ((v[l]*s) * x[l] + (w[l]*s) * y[l]);
  }
  for (l = 0; l < half; l++)
  {
    const double s = 1.0 + xx[half+l];
    u[half+l] = a * (-(v[half-1-l]*s) * x[half+l] + (w[half-1-l]*s) * y[half+l]);
  }
}
165
/** Like auvxpwy_symmetric_1, but the coefficients are divided by
 *  (1 + xx[l]) instead of multiplied; the v-term is negated in the mirrored
 *  second half. Assumes n even and xx[l] != -1. */
static inline void auvxpwy_symmetric_2(double a, double _Complex* u, double _Complex* x,
    double* v, double _Complex* y, double* w, int n, double *xx)
{
  const int half = n>>1;
  int l;
  for (l = 0; l < half; l++)
  {
    const double s = 1.0 + xx[l];
    u[l] = a * ((v[l]/s) * x[l] + (w[l]/s) * y[l]);
  }
  for (l = 0; l < half; l++)
  {
    const double s = 1.0 + xx[half+l];
    u[half+l] = a * (-(v[half-1-l]/s) * x[half+l] + (w[half-1-l]/s) * y[half+l]);
  }
}
179
/** Generator for one forward cascade step of the fast polynomial transform.
 *  The step takes Chebyshev coefficient vectors a and b (length 2^(tau+1)),
 *  converts them to function values with a DCT-III, applies the 2x2
 *  function-value matrix (a11 a12; a21 a22) scaled by g via the supplied
 *  multiplication kernels, and converts back with a DCT-II. a and b are
 *  updated in place; set->z is used as scratch. When g == 0 only the second
 *  row of the matrix is applied and a is left as-is after the DCT round-trip
 *  is skipped for it. */
#define FPT_DO_STEP(NAME,M1_FUNCTION,M2_FUNCTION) \
static inline void NAME(double _Complex *a, double _Complex *b, double *a11, double *a12, \
    double *a21, double *a22, double g, int tau, fpt_set set) \
{ \
  \
  int length = 1<<(tau+1); \
  \
  double norm = 1.0/(length<<1); \
  \
  /* Compensate for factors introduced by a raw DCT-III. */ \
  a[0] *= 2.0; \
  b[0] *= 2.0; \
  \
  /* Compute function values from Chebyshev-coefficients using a DCT-III. */ \
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a); \
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b); \
  \
  /*for (k = 0; k < length; k++)*/ \
  /*{*/ \
  /*fprintf(stderr,"fpt_do_step: a11 = %le, a12 = %le, a21 = %le, a22 = %le\n",*/ \
  /*  a11[k],a12[k],a21[k],a22[k]);*/ \
  /*}*/ \
  \
  /* Check, if gamma is zero. */ \
  if (g == 0.0) \
  { \
    /*fprintf(stderr,"gamma == 0!\n");*/ \
    /* Perform multiplication only for second row. */ \
    M2_FUNCTION(norm,b,b,a22,a,a21,length); \
  } \
  else \
  { \
    /*fprintf(stderr,"gamma != 0!\n");*/ \
    /* Perform multiplication for both rows. */ \
    M2_FUNCTION(norm,set->z,b,a22,a,a21,length); \
    M1_FUNCTION(norm*g,a,a,a11,b,a12,length); \
    memcpy(b,set->z,length*sizeof(double _Complex)); \
    /* Compute Chebyshev-coefficients using a DCT-II. */ \
    fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a); \
    /* Compensate for factors introduced by a raw DCT-II. */ \
    a[0] *= 0.5; \
  } \
  \
  /* Compute Chebyshev-coefficients using a DCT-II. */ \
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b); \
  /* Compensate for factors introduced by a raw DCT-II. */ \
  b[0] *= 0.5; \
}

/* Generic step and fully symmetric step. */
FPT_DO_STEP(fpt_do_step,auvxpwy,auvxpwy)
FPT_DO_STEP(fpt_do_step_symmetric,auvxpwy_symmetric,auvxpwy_symmetric)
/*FPT_DO_STEP(fpt_do_step_symmetric_u,auvxpwy_symmetric,auvxpwy)
FPT_DO_STEP(fpt_do_step_symmetric_l,auvxpwy,auvxpwy_symmetric)*/
233
/** Forward cascade step for the "upper symmetric" case: only the first
 *  matrix row (a11, a12) is stored; the second row is reconstructed on the
 *  fly via auvxpwy_symmetric_1 using the node vector x.
 *  NOTE(review): x presumably holds the Chebyshev nodes of this level
 *  (set->xcvecs[tau-1]) — confirm at call sites. a and b (length 2^(tau+1))
 *  are updated in place; set->z is scratch. */
static inline void fpt_do_step_symmetric_u(double _Complex *a, double _Complex *b,
    double *a11, double *a12, double *a21, double *a22, double *x,
    double gam, int tau, fpt_set set)
{
  /* Length of the coefficient arrays at this cascade level. */
  int length = 1<<(tau+1);
  /* Normalization compensating the DCT-III/DCT-II round trip. */
  double norm = 1.0/(length<<1);

  /* Second matrix row is unused in this variant. */
  UNUSED(a21); UNUSED(a22);

  /* Compensate for factors introduced by a raw DCT-III. */
  a[0] *= 2.0;
  b[0] *= 2.0;

  /* Compute function values from Chebyshev-coefficients using a DCT-III. */
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);

  /* Check, if gamma is zero. */
  if (gam == 0.0)
  {
    /* Perform multiplication only for second row. */
    auvxpwy_symmetric_1(norm,b,b,a12,a,a11,length,x);
  }
  else
  {
    /* Perform multiplication for both rows. */
    auvxpwy_symmetric_1(norm,set->z,b,a12,a,a11,length,x);
    auvxpwy_symmetric(norm*gam,a,a,a11,b,a12,length);
    memcpy(b,set->z,length*sizeof(double _Complex));
    /* Compute Chebyshev-coefficients using a DCT-II. */
    fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
    /* Compensate for factors introduced by a raw DCT-II. */
    a[0] *= 0.5;
  }

  /* Compute Chebyshev-coefficients using a DCT-II. */
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
  /* Compensate for factors introduced by a raw DCT-II. */
  b[0] *= 0.5;
}
284
/** Forward cascade step for the "lower symmetric" case: only the second
 *  matrix row (a21, a22) is stored; the first row is reconstructed on the
 *  fly via auvxpwy_symmetric_2 using the node vector x.
 *  Counterpart of fpt_do_step_symmetric_u. a and b are updated in place;
 *  set->z is scratch. */
static inline void fpt_do_step_symmetric_l(double _Complex *a, double _Complex *b,
    double *a11, double *a12, double *a21, double *a22, double *x, double gam, int tau, fpt_set set)
{
  /* Length of the coefficient arrays at this cascade level. */
  int length = 1<<(tau+1);
  /* Normalization compensating the DCT-III/DCT-II round trip. */
  double norm = 1.0/(length<<1);

  /* Compensate for factors introduced by a raw DCT-III. */
  a[0] *= 2.0;
  b[0] *= 2.0;

  /* First matrix row is unused in this variant. */
  UNUSED(a11); UNUSED(a12);

  /* Compute function values from Chebyshev-coefficients using a DCT-III. */
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);

  /* Check, if gamma is zero. */
  if (gam == 0.0)
  {
    /* Perform multiplication only for second row. */
    auvxpwy_symmetric(norm,b,b,a22,a,a21,length);
  }
  else
  {
    /* Perform multiplication for both rows. */
    auvxpwy_symmetric(norm,set->z,b,a22,a,a21,length);
    auvxpwy_symmetric_2(norm*gam,a,a,a21,b,a22,length,x);
    memcpy(b,set->z,length*sizeof(double _Complex));
    /* Compute Chebyshev-coefficients using a DCT-II. */
    fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
    /* Compensate for factors introduced by a raw DCT-II. */
    a[0] *= 0.5;
  }

  /* Compute Chebyshev-coefficients using a DCT-II. */
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
  /* Compensate for factors introduced by a raw DCT-II. */
  b[0] *= 0.5;
}
334
/** Generator for one transposed (adjoint) cascade step: DCT-III to function
 *  values, multiplication with the transposed 2x2 matrix scaled by norm and
 *  g via the supplied kernels, then DCT-II back to Chebyshev coefficients.
 *  a and b are updated in place; set->z is scratch. */
#define FPT_DO_STEP_TRANSPOSED(NAME,M1_FUNCTION,M2_FUNCTION) \
static inline void NAME(double _Complex *a, double _Complex *b, double *a11, \
    double *a12, double *a21, double *a22, double g, int tau, fpt_set set) \
{ \
  \
  int length = 1<<(tau+1); \
  \
  double norm = 1.0/(length<<1); \
  \
  /* Compute function values from Chebyshev-coefficients using a DCT-III. */ \
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a); \
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b); \
  \
  /* Perform matrix multiplication. */ \
  M1_FUNCTION(norm,g,set->z,a,a11,b,a21,length); \
  M2_FUNCTION(norm,g,b,a,a12,b,a22,length); \
  memcpy(a,set->z,length*sizeof(double _Complex)); \
  \
  /* Compute Chebyshev-coefficients using a DCT-II. */ \
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a); \
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b); \
}

/* Generic transposed step and fully symmetric transposed step. */
FPT_DO_STEP_TRANSPOSED(fpt_do_step_t,abuvxpwy,abuvxpwy)
FPT_DO_STEP_TRANSPOSED(fpt_do_step_t_symmetric,abuvxpwy_symmetric1,abuvxpwy_symmetric2)
/*FPT_DO_STEP_TRANSPOSED(fpt_do_step_t_symmetric_u,abuvxpwy_symmetric1_1,abuvxpwy_symmetric1_2)*/
/*FPT_DO_STEP_TRANSPOSED(fpt_do_step_t_symmetric_l,abuvxpwy_symmetric2_2,abuvxpwy_symmetric2_1)*/
362
363
/** Transposed cascade step for the "upper symmetric" case: only a11/a12 are
 *  stored; the missing entries are reconstructed from the node vector x by
 *  the abuvxpwy_symmetric1_* kernels. a and b are updated in place;
 *  set->z is scratch. */
static inline void fpt_do_step_t_symmetric_u(double _Complex *a,
    double _Complex *b, double *a11, double *a12, double *x,
    double gam, int tau, fpt_set set)
{
  /* Length of the coefficient arrays at this cascade level. */
  int length = 1<<(tau+1);
  /* Normalization compensating the DCT-III/DCT-II round trip. */
  double norm = 1.0/(length<<1);

  /* Compute function values from Chebyshev-coefficients using a DCT-III. */
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);

  /* Perform matrix multiplication. */
  abuvxpwy_symmetric1_1(norm,gam,set->z,a,a11,b,length,x);
  abuvxpwy_symmetric1_2(norm,gam,b,a,a12,b,length,x);
  memcpy(a,set->z,length*sizeof(double _Complex));

  /* Compute Chebyshev-coefficients using a DCT-II. */
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
}
386
/** Transposed cascade step for the "lower symmetric" case: only a21/a22 are
 *  stored; the missing entries are reconstructed from the node vector x by
 *  the abuvxpwy_symmetric2_* kernels. Counterpart of
 *  fpt_do_step_t_symmetric_u. a and b are updated in place; set->z is
 *  scratch. */
static inline void fpt_do_step_t_symmetric_l(double _Complex *a,
    double _Complex *b, double *a21, double *a22,
    double *x, double gam, int tau, fpt_set set)
{
  /* Length of the coefficient arrays at this cascade level. */
  int length = 1<<(tau+1);
  /* Normalization compensating the DCT-III/DCT-II round trip. */
  double norm = 1.0/(length<<1);

  /* Compute function values from Chebyshev-coefficients using a DCT-III. */
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)a,(double*)a);
  fftw_execute_r2r(set->plans_dct3[tau-1],(double*)b,(double*)b);

  /* Perform matrix multiplication. */
  abuvxpwy_symmetric2_2(norm,gam,set->z,a,b,a21,length,x);
  abuvxpwy_symmetric2_1(norm,gam,b,a,b,a22,length,x);
  memcpy(a,set->z,length*sizeof(double _Complex));

  /* Compute Chebyshev-coefficients using a DCT-II. */
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)a,(double*)a);
  fftw_execute_r2r(set->plans_dct2[tau-1],(double*)b,(double*)b);
}
409
410
/** Evaluates the degree-k polynomial defined by the three-term recurrence
 *  coefficients alpha, beta, gam at the nodes x[0..size-1] using the
 *  Clenshaw algorithm, writing the results to y[0..size-1].
 *  The coefficient arrays are read at indices k down to 1.
 *  For k == 0 the result is identically 1.
 *  In the default build the downward recurrence runs in double precision
 *  and escalates to long double only once |a| approaches overflow
 *  (threshold 1e247); defining FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE forces
 *  long double throughout. */
static void eval_clenshaw(const double *x, double *y, int size, int k, const double *alpha,
    const double *beta, const double *gam)
{
  /* Evaluate the associated Legendre polynomial P_{k,nleg} (l,x) for the vector
   * of knots x[0], ..., x[size-1] by the Clenshaw algorithm
   */
  int i,j;
  const double *x_act;
  double *y_act;
  const double *alpha_act, *beta_act, *gamma_act;

  /* Traverse all nodes. */
  x_act = x;
  y_act = y;
  for (i = 0; i < size; i++)
  {
    double x_val_act = *x_act;

    if (k == 0)
    {
      /* Degree zero: constant 1. */
      *y_act = 1.0;
    }
    else
    {
      /* Start the downward recurrence at index k. */
      alpha_act = &(alpha[k]);
      beta_act = &(beta[k]);
      gamma_act = &(gam[k]);

#ifdef FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE
      long double a = 1.0;
      long double b = 0.0;
      for (j = k; j > 1; j--)
      {
        long double a_old = a;
        a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
        b = a_old*(*gamma_act);
        alpha_act--;
        beta_act--;
        gamma_act--;
      }
      *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
#else
      double a = 1.0;
      double b = 0.0;
      /* 1e247 should not trigger for NFSFT with N <= 1024 */
      for (j = k; j > 1 && fabs(a) < 1e247; j--)
      {
        double a_old = a;
        a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
        b = a_old*(*gamma_act);
        alpha_act--;
        beta_act--;
        gamma_act--;
      }
      if (j <= 1)
        *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
      else /* fabs(a) >= 1e247, continue in long double */
      {
        /* Resume the remaining iterations in extended precision to avoid
         * overflow of the intermediate values. */
        long double a_ld = a;
        long double b_ld = b;
        for (; j > 1; j--)
        {
          long double a_old = a_ld;
          a_ld = b_ld + a_old*((*alpha_act)*x_val_act+(*beta_act));
          b_ld = a_old*(*gamma_act);
          alpha_act--;
          beta_act--;
          gamma_act--;
        }
        *y_act = (a_ld*((*alpha_act)*x_val_act+(*beta_act))+b_ld);
      }
#endif

    }
    x_act++;
    y_act++;
  }
}
489
/** Clenshaw evaluation as in eval_clenshaw, additionally storing the
 *  next-to-last recurrence value ("a") in z — but only for the first size1
 *  nodes (for k == 0, z is written for every node). The coefficient arrays
 *  are read at indices k down to 1. Runs entirely in double precision. */
static void eval_clenshaw2(const double *x, double *z, double *y, int size1, int size, int k, const double *alpha,
    const double *beta, const double *gam)
{
  int i, j;

  for (i = 0; i < size; i++)
  {
    const double xv = x[i];

    if (k == 0)
    {
      /* Degree zero: constant 1, auxiliary value 0. */
      y[i] = 1.0;
      z[i] = 0.0;
    }
    else
    {
      /* Downward Clenshaw recurrence from index k to 2. */
      double a = 1.0;
      double b = 0.0;
      for (j = k; j > 1; j--)
      {
        const double a_prev = a;
        a = b + a_prev * (alpha[j] * xv + beta[j]);
        b = a_prev * gam[j];
      }
      /* Store the auxiliary value only for the first size1 nodes. */
      if (i < size1)
        z[i] = a;
      /* Final combination uses the coefficients at index 1. */
      y[i] = a * (alpha[1] * xv + beta[1]) + b;
    }
  }
}
545
/** Clenshaw evaluation as in eval_clenshaw2 (z receives the next-to-last
 *  recurrence value for every node here), aborting early with return value 1
 *  as soon as any |y[i]| exceeds |threshold|; returns 0 if all values stay
 *  below the threshold. Uses the double -> long double escalation scheme of
 *  eval_clenshaw. R and FABS come from infft.h. */
static int eval_clenshaw_thresh2(const double *x, double *z, double *y, int size, int k,
    const double *alpha, const double *beta, const double *gam, const
    double threshold)
{
  /* Evaluate the associated Legendre polynomial P_{k,nleg} (l,x) for the vector
   * of knots x[0], ..., x[size-1] by the Clenshaw algorithm
   */
  int i,j;
  double x_val_act;
  const double *x_act;
  double *y_act, *z_act;
  const double *alpha_act, *beta_act, *gamma_act;
  /* Compare magnitudes, so the sign of the threshold is irrelevant. */
  const R threshold_abs = FABS(threshold);

  /* Traverse all nodes. */
  x_act = x;
  y_act = y;
  z_act = z;
  for (i = 0; i < size; i++)
  {
    x_val_act = *x_act;

    if (k == 0)
    {
      /* Degree zero: constant 1, auxiliary value 0. */
      *y_act = 1.0;
      *z_act = 0.0;
    }
    else
    {
      /* Start the downward recurrence at index k. */
      alpha_act = &(alpha[k]);
      beta_act = &(beta[k]);
      gamma_act = &(gam[k]);

#ifdef FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE
      long double a = 1.0;
      long double b = 0.0;
      for (j = k; j > 1; j--)
      {
        long double a_old = a;
        a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
        b = a_old*(*gamma_act);
        alpha_act--;
        beta_act--;
        gamma_act--;
      }
      *z_act = a;
      *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
#else
      double a = 1.0;
      double b = 0.0;
      for (j = k; j > 1 && fabs(a) < 1e247; j--)
      {
        double a_old = a;
        a = b + a_old*((*alpha_act)*x_val_act+(*beta_act));
        b = a_old*(*gamma_act);
        alpha_act--;
        beta_act--;
        gamma_act--;
      }
      if (j <= 1)
      {
        *z_act = a;
        *y_act = (a*((*alpha_act)*x_val_act+(*beta_act))+b);
      }
      else /* fabs(a) >= 1e247, continue in long double */
      {
        /* Resume the remaining iterations in extended precision to avoid
         * overflow of the intermediate values. */
        long double a_ld = a;
        long double b_ld = b;
        for (; j > 1; j--)
        {
          long double a_old = a_ld;
          a_ld = b_ld + a_old*((*alpha_act)*x_val_act+(*beta_act));
          b_ld = a_old*(*gamma_act);
          alpha_act--;
          beta_act--;
          gamma_act--;
        }
        *z_act = a_ld;
        *y_act = (a_ld*((*alpha_act)*x_val_act+(*beta_act))+b_ld);
      }
#endif
      /* Early exit: the caller only needs to know the threshold was hit. */
      if (FABS(*y_act) > threshold_abs)
        return 1;
    }
    x_act++;
    y_act++;
    z_act++;
  }
  return 0;
}
636
/** Evaluates the expansion sum_{k=0}^{N} a[k] P_k at the nodes x[0..M],
 *  scaled by lambda, via the Clenshaw summation algorithm, writing the
 *  results to y[0..M]. P_k is defined by the three-term recurrence
 *  coefficients alpha, beta, gam (read at indices 0..N-1 resp. 1..N-1).
 *  As in eval_clenshaw, the default build runs in double precision and
 *  escalates to long double once an intermediate magnitude reaches 1e247;
 *  FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE forces long double throughout. */
static inline void eval_sum_clenshaw_fast(const int N, const int M,
    const double _Complex *a, const double *x, double _Complex *y,
    const double *alpha, const double *beta, const double *gam,
    const double lambda)
{
  int j,k;

  if (N == 0)
    /* Degenerate case: the expansion is the constant a[0]. */
    for (j = 0; j <= M; j++)
      y[j] = a[0];
  else
  {
    for (j = 0; j <= M; j++)
    {
      double xc = x[j];
#ifdef FPT_CLENSHAW_USE_ONLY_LONG_DOUBLE
      long double _Complex tmp1 = a[N-1];
      long double _Complex tmp2 = a[N];
      for (k = N-1; k > 0; k--)
      {
        long double _Complex tmp3 = a[k-1] + tmp2 * gam[k];
        tmp2 *= (alpha[k] * xc + beta[k]);
        tmp2 += tmp1;
        tmp1 = tmp3;
      }
      tmp2 *= (alpha[0] * xc + beta[0]);
      y[j] = lambda * (tmp2 + tmp1);
#else
      double _Complex tmp1 = a[N-1];
      double _Complex tmp2 = a[N];
      /* 1e247 should not trigger for NFSFT with N <= 1024 */
      for (k = N-1; k > 0 && fabs(creal(tmp2)) < 1e247 && fabs(cimag(tmp2)) < 1e247; k--)
      {
        double _Complex tmp3 = a[k-1] + tmp2 * gam[k];
        tmp2 *= (alpha[k] * xc + beta[k]);
        tmp2 += tmp1;
        tmp1 = tmp3;
      }
      if (k <= 0)
      {
        /* Recurrence completed entirely in double precision. */
        tmp2 *= (alpha[0] * xc + beta[0]);
        y[j] = lambda * (tmp2 + tmp1);
      }
      else /* fabs(tmp2) >= 1e247 */
      {
        /* Resume the remaining iterations in extended precision to avoid
         * overflow of the intermediate values. */
        long double _Complex tmp1_ld = tmp1;
        long double _Complex tmp2_ld = tmp2;
        for (; k > 0; k--)
        {
          long double _Complex tmp3_ld = a[k-1] + tmp2_ld * gam[k];
          tmp2_ld *= (alpha[k] * xc + beta[k]);
          tmp2_ld += tmp1_ld;
          tmp1_ld = tmp3_ld;
        }
        tmp2_ld *= (alpha[0] * xc + beta[0]);
        y[j] = lambda * (tmp2_ld + tmp1_ld);
      } /* end fabs(tmp2) >= 1e247 */
#endif
    } /* for j */
  } /* N > 0 */
}
698
/** Transposed Clenshaw summation: from values y[0..M] at nodes x[0..M],
 *  accumulates the coefficients a[0..N] of the expansion in the polynomials
 *  P_k defined by alpha, beta, gam, scaled by lambda.
 *  WARNING: y and temp are used as work arrays and are overwritten. */
static void eval_sum_clenshaw_transposed(int N, int M, double _Complex* a, double *x,
    double _Complex *y, double _Complex *temp, double *alpha, double *beta, double *gam,
    double lambda)
{
  int j, k;

  /* Scale the input values by lambda in place and accumulate a[0]. */
  a[0] = 0.0;
  for (j = 0; j <= M; j++)
  {
    y[j] = lambda * y[j];
    a[0] += y[j];
  }

  if (N > 0)
  {
    /* Upward recurrence step for a[1]; temp keeps the previous iterate. */
    a[1] = 0.0;
    for (j = 0; j <= M; j++)
    {
      temp[j] = y[j];
      y[j] = y[j] * (alpha[0] * x[j] + beta[0]);
      a[1] += y[j];
    }

    /* General upward recurrence for a[2..N]. */
    for (k = 2; k <= N; k++)
    {
      a[k] = 0.0;
      for (j = 0; j <= M; j++)
      {
        const double _Complex prev = temp[j];
        temp[j] = y[j];
        y[j] = y[j] * (alpha[k-1] * x[j] + beta[k-1]) + gam[k-1] * prev;
        a[k] += y[j];
      }
    }
  }
}
758
/** Transposed Clenshaw summation in long double intermediate precision.
 *  Same contract as eval_sum_clenshaw_transposed, except the input values
 *  y are left untouched and temp is unused (kept for a matching signature);
 *  the per-node iterates live in long double locals instead. */
static void eval_sum_clenshaw_transposed_ld(int N, int M, double _Complex* a, double *x,
    double _Complex *y, double _Complex *temp, double *alpha, double *beta, double *gam,
    double lambda)
{
  int j, k;

  /* Clear all output coefficients before accumulating. */
  for (k = 0; k <= N; k++)
    a[k] = 0.0;

  if (N == 0)
  {
    /* Degenerate case: only the constant coefficient. */
    for (j = 0; j <= M; j++)
      a[0] += lambda * y[j];
  }
  else
  {
    /* Accumulate each node's contribution to every coefficient. */
    for (j = 0; j <= M; j++)
    {
      long double _Complex cur = lambda * y[j];
      long double _Complex prev = cur;
      a[0] += cur;

      cur = cur * (alpha[0] * x[j] + beta[0]);
      a[1] += cur;

      for (k = 2; k <= N; k++)
      {
        const long double _Complex aux = prev;
        prev = cur;
        cur = cur * (alpha[k-1] * x[j] + beta[k-1]) + gam[k-1] * aux;
        a[k] += cur;
      }
    }
  }
}
794
/** Creates and returns a new FPT set holding up to M transforms of maximum
 *  length N = 2^t.
 *
 *  Allocates the per-transform data array (unless FPT_NO_INIT_FPT_DATA is
 *  set), the Chebyshev node vectors for every cascade level, the shared
 *  work/result buffers, and the FFTW DCT-II/DCT-III plans used by the
 *  cascade steps. Scratch vectors for the fast algorithm and buffers for
 *  the direct algorithm are only allocated when the corresponding
 *  FPT_NO_* flag is absent.
 *
 *  The caller owns the returned set and must release it with the matching
 *  finalize routine.
 */
fpt_set fpt_init(const int M, const int t, const unsigned int flags)
{
  /* Polynomial length of the current cascade level. */
  int plength;
  /* Cascade level index. */
  int tau;
  /* Transform index. */
  int m;
  int k;
#ifdef _OPENMP
  int nthreads = X(get_num_threads)();
#endif

  /* Allocate memory for new DPT set. */
  fpt_set_s *set = (fpt_set_s*)nfft_malloc(sizeof(fpt_set_s));

  /* Save parameters in structure. */
  set->flags = flags;

  set->M = M;
  set->t = t;
  set->N = 1<<t;

  if (!(flags & FPT_NO_INIT_FPT_DATA))
  {
    /* Allocate memory for M transforms. */
    set->dpt = (fpt_data*) nfft_malloc(M*sizeof(fpt_data));

    /* Initialize with NULL pointer. */
    for (m = 0; m < set->M; m++)
    {
      set->dpt[m].steps = NULL;
      set->dpt[m].precomputed = false;
    }
  }
  else
    set->dpt = NULL;

  /* Create arrays with Chebyshev nodes. */

  /* Initialize array with Chebyshev coefficients for the polynomial x. This
   * would be trivially an array containing a 1 as second entry with all other
   * coefficients set to zero. In order to compensate for the multiplicative
   * factor 2 introduced by the DCT-III, we set this coefficient to 0.5 here. */

  /* Allocate memory for array of pointers to node arrays. */
  set->xcvecs = (double**) nfft_malloc((set->t)*sizeof(double*));
  /* For each polynomial length starting with 4, compute the Chebyshev nodes
   * using a DCT-III. */
  plength = 4;
  for (tau = 1; tau < t+1; tau++)
  {
    /* Allocate memory for current array. */
    set->xcvecs[tau-1] = (double*) nfft_malloc(plength*sizeof(double));
    for (k = 0; k < plength; k++)
    {
      /* Chebyshev nodes cos((k+1/2)*pi/plength). */
      set->xcvecs[tau-1][k] = cos(((k+0.5)*KPI)/plength);
    }
    plength = plength << 1;
  }

  /* Shared work buffers, sized for two coefficient vectors of length N. */
  set->work = (double _Complex*) nfft_malloc((2*set->N)*sizeof(double _Complex));
  set->result = (double _Complex*) nfft_malloc((2*set->N)*sizeof(double _Complex));

  /* Create DCT-II plans (FFTW_REDFT10), one per cascade level; the plans
   * transform the two interleaved real parts of a complex vector. */
  set->plans_dct2 = (fftw_plan*) nfft_malloc(sizeof(fftw_plan)*(set->t/*-1*/));
  set->kindsr = (fftw_r2r_kind*) nfft_malloc(2*sizeof(fftw_r2r_kind));
  set->kindsr[0] = FFTW_REDFT10;
  set->kindsr[1] = FFTW_REDFT10;
  for (tau = 0, plength = 4; tau < set->t/*-1*/; tau++, plength<<=1)
  {
#ifdef _OPENMP
#pragma omp critical (nfft_omp_critical_fftw_plan)
{
    /* FFTW planning is not thread-safe; serialize plan creation. */
    fftw_plan_with_nthreads(nthreads);
#endif
    set->plans_dct2[tau] =
      fftw_plan_many_r2r(1, &plength, 2, (double*)set->work, NULL,
                         2, 1, (double*)set->result, NULL, 2, 1,set->kindsr,
                         0);
#ifdef _OPENMP
}
#endif
  }

  /* Create DCT-III plans (FFTW_REDFT01), one per cascade level. */
  set->plans_dct3 = (fftw_plan*) nfft_malloc(sizeof(fftw_plan)*(set->t/*-1*/));
  set->kinds = (fftw_r2r_kind*) nfft_malloc(2*sizeof(fftw_r2r_kind));
  set->kinds[0] = FFTW_REDFT01;
  set->kinds[1] = FFTW_REDFT01;
  for (tau = 0, plength = 4; tau < set->t/*-1*/; tau++, plength<<=1)
  {
#ifdef _OPENMP
#pragma omp critical (nfft_omp_critical_fftw_plan)
{
    fftw_plan_with_nthreads(nthreads);
#endif
    set->plans_dct3[tau] =
      fftw_plan_many_r2r(1, &plength, 2, (double*)set->work, NULL,
                         2, 1, (double*)set->result, NULL, 2, 1, set->kinds,
                         0);
#ifdef _OPENMP
}
#endif
  }
  /* The kind descriptors are only needed during planning. */
  nfft_free(set->kinds);
  nfft_free(set->kindsr);
  set->kinds = NULL;
  set->kindsr = NULL;

  set->vec3 = NULL;
  set->vec4 = NULL;
  set->z = NULL;

  set->xc_slow = NULL;
  set->temp = NULL;

  /* Check if fast transform is activated. */
  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
  {
    /* Scratch vectors for the cascade steps. */
    set->vec3 = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
    set->vec4 = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
    set->z = (double _Complex*) nfft_malloc(set->N*sizeof(double _Complex));
  }

  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
  {
    /* Buffers for the direct (slow) algorithm. */
    set->xc_slow = (double*) nfft_malloc((set->N+1)*sizeof(double));
    set->temp = (double _Complex*) nfft_malloc((set->N+1)*sizeof(double _Complex));

    if (!(flags & FPT_NO_INIT_FPT_DATA))
    {
      for (m = 0; m < set->M; m++)
      {
        fpt_data *data = &(set->dpt[m]);
        data->_alpha = NULL;
        data->_beta = NULL;
        data->_gamma = NULL;
      }
    }
  }

  /* Return the newly created DPT set. */
  return set;
}
941
/** First precomputation phase for transform m of the set: stores k_start and
 *  allocates (but does not fill) the cascade step matrices and, for the
 *  direct algorithm, the recurrence coefficient buffers. The numerical
 *  content is filled in later by fpt_precompute_2. A no-op when the steps
 *  for this transform were already allocated. */
void fpt_precompute_1(fpt_set set, const int m, int k_start)
{
  /* Cascade level index. */
  int tau;
  /* Block index within a level. */
  int l;
  /* Polynomial length of the current level. */
  int plength;
  int degree;
  /* First and last block index touched on the current level. */
  int firstl;
  int lastl;
  int k_start_tilde;
  int N_tilde;
  /* Matrix column length (halved for symmetric steps). */
  int clength;
  fpt_data *data;

  /* Get pointer to DPT data. */
  data = &(set->dpt[m]);

  /* Check, if already precomputed. */
  if (data->steps != NULL)
    return;

  /* Save k_start. */
  data->k_start = k_start;

  data->alphaN = NULL;
  data->betaN = NULL;
  data->gammaN = NULL;

  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
  {
    /* Save recursion coefficients: one contiguous allocation split into
     * alphaN / betaN / gammaN of (set->t - 1) entries each. */
    data->alphaN = (double*) nfft_malloc(3*(set->t-1)*sizeof(double));
    data->betaN = data->alphaN + (set->t-1);
    data->gammaN = data->betaN + (set->t-1);

    k_start_tilde = K_START_TILDE(data->k_start,X(next_power_of_2)(data->k_start)
      /*set->N*/);
    N_tilde = N_TILDE(set->N);

    /* Allocate memory for the cascade with t = log_2(N) many levels. */
    data->steps = (fpt_step**) nfft_malloc(sizeof(fpt_step*)*set->t);

    plength = 4;
    for (tau = 1; tau < set->t; tau++)
    {
      /* Compute auxilliary values. */
      degree = plength>>1;
      /* Compute first l. */
      firstl = FIRST_L(k_start_tilde,plength);
      /* Compute last l. */
      lastl = LAST_L(N_tilde,plength);

      /* Allocate memory for current level. This level will contain 2^{t-tau-1}
       * many matrices. */
      data->steps[tau] = (fpt_step*) nfft_malloc(sizeof(fpt_step)
        * (lastl+1));

      /* For l = 0,...2^{t-tau-1}-1 compute the matrices U_{n,tau,l}. */
      for (l = firstl; l <= lastl; l++)
      {
        if ((set->flags & FPT_AL_SYMMETRY) && IS_SYMMETRIC(l,m,plength))
        {
          /* Symmetric step: only half of each matrix entry vector is kept. */
          clength = plength/2;
        }
        else
        {
          clength = plength;
        }

        /* Allocate memory for the components of U_{n,tau,l}:
         * one buffer holding a11, a12, a21, a22 back to back. */
        data->steps[tau][l].a = (double*) nfft_malloc(sizeof(double)*clength*4);
      }
      plength = plength << 1;
    }
  }

  /* Buffers for the direct algorithm, unless data is shared/persistent:
   * one allocation split into _alpha / _beta / _gamma of N+1 entries each. */
  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM) && !(set->flags & FPT_PERSISTENT_DATA) && (data->_alpha == NULL))
  {
    data->_alpha = (double*) nfft_malloc(3*(set->N+1)*sizeof(double));
    data->_beta = data->_alpha + (set->N+1);
    data->_gamma = data->_beta + (set->N+1);
  }
}
1027
/**
 * Second precomputation phase for the DPT/FPT transform with index \e m.
 *
 * Fills the cascade matrices U_{n,tau,l} (components a11, a12, a21, a22) in
 * the step structures that were allocated beforehand by fpt_precompute_1 —
 * see the "moved to fpt_precompute_1" comments throughout. Steps whose
 * Clenshaw evaluation exceeds \e threshold are re-evaluated at a longer,
 * stabilized length (their old matrix storage is freed and reallocated).
 *
 * \param set       The FPT set holding all transform data.
 * \param m         Index of the transform (selects set->dpt[m]).
 * \param alpha     Three-term recurrence coefficients alpha_k.
 * \param beta      Three-term recurrence coefficients beta_k.
 * \param gam       Three-term recurrence coefficients gamma_k.
 * \param k_start   First degree of the transform; saved into the data record.
 * \param threshold Stabilization threshold for eval_clenshaw_thresh2.
 */
void fpt_precompute_2(fpt_set set, const int m, double *alpha, double *beta,
  double *gam, int k_start, const double threshold)
{

  int tau;              /* Cascade level index                              */
  int l;                /* Block index within a cascade level               */
  int plength;          /* Polynomial length at the current level           */
  int degree;           /* Degree of the polynomials evaluated at the level */
  int firstl;           /* First block index touched at the current level   */
  int lastl;            /* Last block index touched at the current level    */
  int plength_stab;     /* Polynomial length for a stabilized step          */
  int degree_stab;      /* Polynomial degree for a stabilized step          */
  /*double *a11;*/
  /*double *a12;*/
  /*double *a21;*/
  /*double *a22;*/
  const double *calpha; /* Cursor into the alpha recurrence coefficients    */
  const double *cbeta;  /* Cursor into the beta recurrence coefficients     */
  const double *cgamma; /* Cursor into the gamma recurrence coefficients    */
  int needstab = 0;     /* Non-zero when the current step must be stabilized */
  int k_start_tilde;
  int N_tilde;
  int clength;          /* Length of one matrix component array             */
  int clength_1;
  int clength_2;
  int t_stab, N_stab;   /* Exponent and length of a stabilized step         */
  fpt_data *data;

  /* Get pointer to DPT data. */
  data = &(set->dpt[m]);

  /* Check, if already precomputed. */
  if ((data->steps != NULL) && (data->precomputed))
    return;

  /* Save k_start. */
  data->k_start = k_start;

  data->gamma_m1 = gam[0];
/* moved to fpt_precompute_1
  data->alphaN = NULL;
  data->betaN = NULL;
  data->gammaN = NULL;*/

  /* Check if fast transform is activated. */
  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
  {
    /* Save recursion coefficients. moved to fpt_precompute_1
    data->alphaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex));
    data->betaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex));
    data->gammaN = (double*) nfft_malloc((set->t-1)*sizeof(double _Complex)); */

    /* Copy the coefficients at the power-of-two positions 2^tau; the arrays
     * have set->t-1 entries (allocated in fpt_precompute_1). */
    for (tau = 2; tau <= set->t; tau++)
    {

      data->alphaN[tau-2] = alpha[1<<tau];
      data->betaN[tau-2] = beta[1<<tau];
      data->gammaN[tau-2] = gam[1<<tau];
    }

    data->alpha_0 = alpha[1];
    data->beta_0 = beta[1];

    k_start_tilde = K_START_TILDE(data->k_start,X(next_power_of_2)(data->k_start)
      /*set->N*/);
    N_tilde = N_TILDE(set->N);

    /* Allocate memory for the cascade with t = log_2(N) many levels. moved to fpt_precompute_1
    data->steps = (fpt_step**) nfft_malloc(sizeof(fpt_step*)*set->t); */

    /* For tau = 1,...t compute the matrices U_{n,tau,l}. */
    plength = 4;
    for (tau = 1; tau < set->t; tau++)
    {
      /* Compute auxiliary values. */
      degree = plength>>1;
      /* Compute first l. */
      firstl = FIRST_L(k_start_tilde,plength);
      /* Compute last l. */
      lastl = LAST_L(N_tilde,plength);

      /* Allocate memory for current level. This level will contain 2^{t-tau-1}
       * many matrices. moved to fpt_precompute_1
      data->steps[tau] = (fpt_step*) nfft_malloc(sizeof(fpt_step)
        * (lastl+1)); */

      /* For l = 0,...2^{t-tau-1}-1 compute the matrices U_{n,tau,l}. */
      for (l = firstl; l <= lastl; l++)
      {
        if ((set->flags & FPT_AL_SYMMETRY) && IS_SYMMETRIC(l,m,plength))
        {
          //fprintf(stderr,"fpt_precompute(%d): symmetric step\n",m);
          //fflush(stderr);
          /* Symmetric steps only need half-length component arrays. */
          clength = plength/2;
        }
        else
        {
          clength = plength;
        }

        /* Allocate memory for the components of U_{n,tau,l}. moved to fpt_precompute_1
        data->steps[tau][l].a11 = (double*) nfft_malloc(sizeof(double)*clength);
        data->steps[tau][l].a12 = (double*) nfft_malloc(sizeof(double)*clength);
        data->steps[tau][l].a21 = (double*) nfft_malloc(sizeof(double)*clength);
        data->steps[tau][l].a22 = (double*) nfft_malloc(sizeof(double)*clength); */

        /* Evaluate the associated polynomials at the 2^{tau+1} Chebyshev
         * nodes. */

        /* Get the pointers to the three-term recurrence coeffcients. */
        calpha = &(alpha[plength*l+1+1]);
        cbeta = &(beta[plength*l+1+1]);
        cgamma = &(gam[plength*l+1+1]);

        /* The four matrix components live consecutively in one allocation
         * (made by fpt_precompute_1). */
        double *a11 = data->steps[tau][l].a;
        double *a12 = a11+clength;
        double *a21 = a12+clength;
        double *a22 = a21+clength;

        if (set->flags & FPT_NO_STABILIZATION)
        {
          /* Evaluate P_{2^{tau}-2}^n(\cdot,2^{tau+1}l+2). */
          calpha--;
          cbeta--;
          cgamma--;
          eval_clenshaw2(set->xcvecs[tau-1], a11, a21, clength, clength, degree-1, calpha, cbeta,
            cgamma);
          eval_clenshaw2(set->xcvecs[tau-1], a12, a22, clength, clength, degree, calpha, cbeta,
            cgamma);
          needstab = 0;
        }
        else
        {
          calpha--;
          cbeta--;
          cgamma--;
          /* Evaluate P_{2^{tau}-1}^n(\cdot,2^{tau+1}l+1). */
          needstab = eval_clenshaw_thresh2(set->xcvecs[tau-1], a11, a21, clength,
            degree-1, calpha, cbeta, cgamma, threshold);
          if (needstab == 0)
          {
            /* Evaluate P_{2^{tau}}^n(\cdot,2^{tau+1}l+1). */
            needstab = eval_clenshaw_thresh2(set->xcvecs[tau-1], a12, a22, clength,
              degree, calpha, cbeta, cgamma, threshold);
          }
        }


        /* Check if stabilization needed. */
        if (needstab == 0)
        {
          /* No stabilization needed. */
          data->steps[tau][l].g = gam[plength*l+1+1];
          data->steps[tau][l].stable = true;
        }
        else
        {
          /* Stabilize. */
          degree_stab = degree*(2*l+1);
          X(next_power_of_2_exp_int)((l+1)*(1<<(tau+1)),&N_stab,&t_stab);

          /* The old arrays are too small; the step is re-evaluated at the
           * longer stabilized length N_stab. */
          nfft_free(data->steps[tau][l].a);
          a11 = NULL;
          a12 = NULL;
          a21 = NULL;
          a22 = NULL;

          plength_stab = N_stab;

          if (set->flags & FPT_AL_SYMMETRY)
          {
            if (m <= 1)
            {
              /* This should never be executed */
              clength_1 = plength_stab;
              clength_2 = plength_stab;
              /* Allocate memory for arrays. */
              data->steps[tau][l].a = (double*) nfft_malloc(sizeof(double)*(clength_1*2+clength_2*2));
              a11 = data->steps[tau][l].a;
              a12 = a11+clength_1;
              a21 = a12+clength_1;
              a22 = a21+clength_2;
              /* Get the pointers to the three-term recurrence coeffcients. */
              calpha = &(alpha[1]); cbeta = &(beta[1]); cgamma = &(gam[1]);
              eval_clenshaw2(set->xcvecs[t_stab-2], a11, a21, clength_1,
                clength_2, degree_stab-1, calpha, cbeta, cgamma);
              eval_clenshaw2(set->xcvecs[t_stab-2], a12, a22, clength_1,
                clength_2, degree_stab+0, calpha, cbeta, cgamma);
            }
            else
            {
              /* Only half of the matrix is needed; which half depends on the
               * parity of m. */
              clength = plength_stab/2;
              data->steps[tau][l].a = (double*) nfft_malloc(sizeof(double)*clength*2);
              if (m%2 == 0)
              {
                a11 = data->steps[tau][l].a;
                a12 = a11+clength;
                calpha = &(alpha[2]); cbeta = &(beta[2]); cgamma = &(gam[2]);
                eval_clenshaw(set->xcvecs[t_stab-2], a11, clength,
                  degree_stab-2, calpha, cbeta, cgamma);
                eval_clenshaw(set->xcvecs[t_stab-2], a12, clength,
                  degree_stab-1, calpha, cbeta, cgamma);
              }
              else
              {
                a21 = data->steps[tau][l].a;
                a22 = a21+clength;
                calpha = &(alpha[1]); cbeta = &(beta[1]); cgamma = &(gam[1]);
                eval_clenshaw(set->xcvecs[t_stab-2], a21, clength,
                  degree_stab-1,calpha, cbeta, cgamma);
                eval_clenshaw(set->xcvecs[t_stab-2], a22, clength,
                  degree_stab+0, calpha, cbeta, cgamma);
              }
            }
          }
          else
          {
            clength_1 = plength_stab;
            clength_2 = plength_stab;
            data->steps[tau][l].a = (double*) nfft_malloc(sizeof(double)*(clength_1*2+clength_2*2));
            a11 = data->steps[tau][l].a;
            a12 = a11+clength_1;
            a21 = a12+clength_1;
            a22 = a21+clength_2;
            /* Net effect: point at index 1 of each coefficient array. */
            calpha = &(alpha[2]);
            cbeta = &(beta[2]);
            cgamma = &(gam[2]);
            calpha--;
            cbeta--;
            cgamma--;
            eval_clenshaw2(set->xcvecs[t_stab-2], a11, a21, clength_1, clength_2, degree_stab-1,
              calpha, cbeta, cgamma);
            eval_clenshaw2(set->xcvecs[t_stab-2], a12, a22, clength_1, clength_2, degree_stab+0,
              calpha, cbeta, cgamma);

          }

          /* Record stabilized-step metadata used later by fpt_trafo /
           * fpt_transposed. */
          data->steps[tau][l].g = gam[1+1];
          data->steps[tau][l].stable = false;
          data->steps[tau][l].ts = t_stab;
          data->steps[tau][l].Ns = N_stab;
        }
      }
      /* Double length of polynomials. */
      plength = plength << 1;
    }
    data->precomputed = true;
  }

  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
  {
    /* Check, if recurrence coefficients must be copied. */
    if (set->flags & FPT_PERSISTENT_DATA)
    {
      /* Persistent data: just alias the caller's arrays (caller keeps
       * ownership and must keep them alive). */
      data->_alpha = (double*) alpha;
      data->_beta = (double*) beta;
      data->_gamma = (double*) gam;
    }
    else
    {/* moved to fpt_precompute_1
      data->_alpha = (double*) nfft_malloc((set->N+1)*sizeof(double));
      data->_beta = (double*) nfft_malloc((set->N+1)*sizeof(double));
      data->_gamma = (double*) nfft_malloc((set->N+1)*sizeof(double));*/
      memcpy(data->_alpha,alpha,(set->N+1)*sizeof(double));
      memcpy(data->_beta,beta,(set->N+1)*sizeof(double));
      memcpy(data->_gamma,gam,(set->N+1)*sizeof(double));
    }
  }
}
1306
/**
 * Precomputes all data for the transform with index \e m in two phases:
 * fpt_precompute_1 allocates the step structures, fpt_precompute_2 fills
 * them from the three-term recurrence coefficients. The phases must run in
 * this order.
 *
 * \param set       The FPT set.
 * \param m         Index of the transform.
 * \param alpha     Three-term recurrence coefficients alpha_k.
 * \param beta      Three-term recurrence coefficients beta_k.
 * \param gam       Three-term recurrence coefficients gamma_k.
 * \param k_start   First degree of the transform.
 * \param threshold Stabilization threshold.
 */
void fpt_precompute(fpt_set set, const int m, double *alpha, double *beta,
  double *gam, int k_start, const double threshold)
{
  fpt_precompute_1(set, m, k_start);
  fpt_precompute_2(set, m, alpha, beta, gam, k_start, threshold);
}
1313
/**
 * Direct (slow) discrete polynomial transform for transform index \e m.
 *
 * Maps the input coefficients x (degrees k_start..k_end) either to function
 * values at Chebyshev nodes (FPT_FUNCTION_VALUES set) or to Chebyshev
 * coefficients via Clenshaw summation followed by a DCT-II. Returns without
 * doing anything when the direct algorithm is disabled for this set.
 *
 * \param set   The FPT set.
 * \param m     Index of the transform.
 * \param x     Input coefficients (length k_end - k_start + 1).
 * \param y     Output array (length k_end + 1).
 * \param k_end Last degree.
 * \param flags Transform flags (FPT_FUNCTION_VALUES).
 */
void fpt_trafo_direct(fpt_set set, const int m, const double _Complex *x, double _Complex *y,
  const int k_end, const unsigned int flags)
{
  int j;
  fpt_data *data = &(set->dpt[m]);
  int Nk;     /* Smallest power of two >= k_end+1 */
  int tk;     /* Its exponent: Nk = 2^tk          */
  double norm;

  //fprintf(stderr, "Executing dpt.\n");

  X(next_power_of_2_exp_int)(k_end+1,&Nk,&tk);
  /* DCT-II normalization factor 1/Nk. */
  norm = 2.0/(Nk<<1);

  //fprintf(stderr, "Norm = %e.\n", norm);

  if (set->flags & FPT_NO_DIRECT_ALGORITHM)
  {
    return;
  }

  if (flags & FPT_FUNCTION_VALUES)
  {
    /* Fill array with Chebyshev nodes. */
    for (j = 0; j <= k_end; j++)
    {
      set->xc_slow[j] = cos((KPI*(j+0.5))/(k_end+1));
      //fprintf(stderr, "x[%4d] = %e.\n", j, set->xc_slow[j]);
    }

    /* Zero-pad the coefficients below k_start, then evaluate the expansion
     * directly at the Chebyshev nodes. */
    memset(set->result,0U,data->k_start*sizeof(double _Complex));
    memcpy(&set->result[data->k_start],x,(k_end-data->k_start+1)*sizeof(double _Complex));

    /*eval_sum_clenshaw(k_end, k_end, set->result, set->xc_slow,
      y, set->work, &data->alpha[1], &data->beta[1], &data->gamma[1],
      data->gamma_m1);*/
    eval_sum_clenshaw_fast(k_end, k_end, set->result, set->xc_slow,
      y, &data->_alpha[1], &data->_beta[1], &data->_gamma[1], data->gamma_m1);
  }
  else
  {
    /* Zero-pad below k_start, evaluate at the precomputed Chebyshev node
     * set, then convert function values to Chebyshev coefficients with a
     * DCT-II. */
    memset(set->temp,0U,data->k_start*sizeof(double _Complex));
    memcpy(&set->temp[data->k_start],x,(k_end-data->k_start+1)*sizeof(double _Complex));

    eval_sum_clenshaw_fast(k_end, Nk-1, set->temp, set->xcvecs[tk-2],
      set->result, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
      data->gamma_m1);

    fftw_execute_r2r(set->plans_dct2[tk-2],(double*)set->result,
      (double*)set->result);

    /* The DCT-II doubles the first coefficient; undo that, then apply the
     * overall normalization. */
    set->result[0] *= 0.5;
    for (j = 0; j < Nk; j++)
    {
      set->result[j] *= norm;
    }

    memcpy(y,set->result,(k_end+1)*sizeof(double _Complex));
  }
}
1374
/**
 * Fast polynomial transform (cascade summation) for transform index \e m.
 *
 * Maps input coefficients x (degrees k_start..k_end) to Chebyshev
 * coefficients y[0..k_end], or to function values at Chebyshev nodes when
 * FPT_FUNCTION_VALUES is set (via a final DCT-III). Falls back to
 * fpt_trafo_direct for small bandwidth (k_end < FPT_BREAK_EVEN). Stabilized
 * steps (precomputed with stable == false) are accumulated separately in
 * set->result and merged at the end.
 *
 * \param set   The FPT set.
 * \param m     Index of the transform.
 * \param x     Input coefficients (length k_end - k_start + 1).
 * \param y     Output array (length k_end + 1).
 * \param k_end Last degree.
 * \param flags Transform flags (FPT_FUNCTION_VALUES).
 */
void fpt_trafo(fpt_set set, const int m, const double _Complex *x, double _Complex *y,
  const int k_end, const unsigned int flags)
{
  /* Get transformation data. */
  fpt_data *data = &(set->dpt[m]);
  int Nk;            /* Smallest power of two >= k_end       */
  int tk;            /* Its exponent: Nk = 2^tk              */
  int k_start_tilde;
  int k_end_tilde;

  int tau;           /* Cascade level index                  */
  int firstl;        /* First block index at current level   */
  int lastl;         /* Last block index at current level    */
  int l;             /* Block index                          */
  int plength;       /* Polynomial length at current level   */
  int plength_stab;  /* Polynomial length of stabilized step */
  int t_stab;        /* Exponent of stabilized step length   */
  fpt_step *step;    /* Current cascade step                 */
  fftw_plan plan = 0;
  int length = k_end+1;
  fftw_r2r_kind kinds[2] = {FFTW_REDFT01,FFTW_REDFT01}; /* DCT-III */

  int k;

  double _Complex *work_ptr;
  const double _Complex *x_ptr;

  /* Check, if slow transformation should be used due to small bandwidth. */
  if (k_end < FPT_BREAK_EVEN)
  {
    /* Use NDSFT. */
    fpt_trafo_direct(set, m, x, y, k_end, flags);
    return;
  }

  X(next_power_of_2_exp_int)(k_end,&Nk,&tk);
  k_start_tilde = K_START_TILDE(data->k_start,Nk);
  k_end_tilde = K_END_TILDE(k_end,Nk);

  /* Check if fast transform is activated. */
  if (set->flags & FPT_NO_FAST_ALGORITHM)
    return;

  if (flags & FPT_FUNCTION_VALUES)
  {
    /* FFTW planning is not thread-safe; guard it with the library-wide
     * critical section when OpenMP is enabled. */
#ifdef _OPENMP
    int nthreads = X(get_num_threads)();
#pragma omp critical (nfft_omp_critical_fftw_plan)
{
    fftw_plan_with_nthreads(nthreads);
#endif
    plan = fftw_plan_many_r2r(1, &length, 2, (double*)set->work, NULL, 2, 1,
      (double*)set->work, NULL, 2, 1, kinds, 0U);
#ifdef _OPENMP
}
#endif
  }

  /* Initialize working arrays. */
  memset(set->result,0U,2*Nk*sizeof(double _Complex));

  /* The first step. */

  /* Set the first 2*data->k_start coefficients to zero. */
  memset(set->work,0U,2*data->k_start*sizeof(double _Complex));

  /* Interleave the input coefficients with zeros: work holds pairs
   * (x_k, 0). */
  work_ptr = &set->work[2*data->k_start];
  x_ptr = x;

  for (k = 0; k <= k_end_tilde-data->k_start; k++)
  {
    *work_ptr++ = *x_ptr++;
    *work_ptr++ = K(0.0);
  }

  /* Set the last 2*(set->N-1-k_end_tilde) coefficients to zero. */
  memset(&set->work[2*(k_end_tilde+1)],0U,2*(Nk-1-k_end_tilde)*sizeof(double _Complex));

  /* If k_end == Nk, use three-term recurrence to map last coefficient x_{Nk} to
   * x_{Nk-1} and x_{Nk-2}. */
  if (k_end == Nk)
  {
    set->work[2*(Nk-2)] += data->gammaN[tk-2]*x[Nk-data->k_start];
    set->work[2*(Nk-1)] += data->betaN[tk-2]*x[Nk-data->k_start];
    set->work[2*(Nk-1)+1] = data->alphaN[tk-2]*x[Nk-data->k_start];
  }

  /* Compute the remaining steps. */
  plength = 4;
  for (tau = 1; tau < tk; tau++)
  {
    /* Compute first l. */
    firstl = FIRST_L(k_start_tilde,plength);
    /* Compute last l. */
    lastl = LAST_L(k_end_tilde,plength);

    /* Compute the multiplication steps. */
    for (l = firstl; l <= lastl; l++)
    {
      /* Copy vectors to multiply into working arrays zero-padded to twice the length. */
      memcpy(set->vec3,&(set->work[(plength/2)*(4*l+2)]),(plength/2)*sizeof(double _Complex));
      memcpy(set->vec4,&(set->work[(plength/2)*(4*l+3)]),(plength/2)*sizeof(double _Complex));
      memset(&set->vec3[plength/2],0U,(plength/2)*sizeof(double _Complex));
      memset(&set->vec4[plength/2],0U,(plength/2)*sizeof(double _Complex));

      /* Copy coefficients into first half. */
      memcpy(&(set->work[(plength/2)*(4*l+2)]),&(set->work[(plength/2)*(4*l+1)]),(plength/2)*sizeof(double _Complex));
      memset(&(set->work[(plength/2)*(4*l+1)]),0U,(plength/2)*sizeof(double _Complex));
      memset(&(set->work[(plength/2)*(4*l+3)]),0U,(plength/2)*sizeof(double _Complex));

      /* Get matrix U_{n,tau,l} */
      step = &(data->steps[tau][l]);

      /* Check if step is stable. */
      if (step->stable)
      {
        /* Check, if we should do a symmetrizised step. */
        if ((set->flags & FPT_AL_SYMMETRY) && IS_SYMMETRIC(l,m,plength))
        {
          /* Symmetric step: half-length component arrays (see
           * fpt_precompute_2). */
          int clength = 1<<(tau);
          double *a11 = step->a;
          double *a12 = a11+clength;
          double *a21 = a12+clength;
          double *a22 = a21+clength;
          /*for (k = 0; k < plength; k++)
          {
            fprintf(stderr,"fpt_trafo: a11 = %le, a12 = %le, a21 = %le, a22 = %le\n",
              a11[k],a12[k],a21[k],step->a22[k]);
          }*/
          /* Multiply third and fourth polynomial with matrix U. */
          //fprintf(stderr,"\nhallo\n");
          fpt_do_step_symmetric(set->vec3, set->vec4, a11,
            a12, a21, a22, step->g, tau, set);
        }
        else
        {
          int clength = 1<<(tau+1);
          double *a11 = step->a;
          double *a12 = a11+clength;
          double *a21 = a12+clength;
          double *a22 = a21+clength;
          /* Multiply third and fourth polynomial with matrix U. */
          fpt_do_step(set->vec3, set->vec4, a11, a12,
            a21, a22, step->g, tau, set);
        }

        /* g == 0 means vec3 carries no contribution (see fpt_do_step). */
        if (step->g != 0.0)
        {
          for (k = 0; k < plength; k++)
          {
            set->work[plength*2*l+k] += set->vec3[k];
          }
        }
        for (k = 0; k < plength; k++)
        {
          set->work[plength*(2*l+1)+k] += set->vec4[k];
        }
      }
      else
      {
        /* Stabilize. */

        /* The length of the polynomials */
        plength_stab = step->Ns;
        t_stab = step->ts;

        /*---------*/
        /*fprintf(stderr,"\nfpt_trafo: stabilizing at tau = %d, l = %d.\n",tau,l);
        fprintf(stderr,"\nfpt_trafo: plength_stab = %d.\n",plength_stab);
        fprintf(stderr,"\nfpt_trafo: tk = %d.\n",tk);
        fprintf(stderr,"\nfpt_trafo: index = %d.\n",tk-tau-1);*/
        /*---------*/

        /* Set rest of vectors explicitely to zero */
        /*fprintf(stderr,"fpt_trafo: stabilizing: plength = %d, plength_stab = %d\n",
          plength, plength_stab);*/
        memset(&set->vec3[plength/2],0U,(plength_stab-plength/2)*sizeof(double _Complex));
        memset(&set->vec4[plength/2],0U,(plength_stab-plength/2)*sizeof(double _Complex));

        /* Multiply third and fourth polynomial with matrix U. */
        /* Check for symmetry. */
        if (set->flags & FPT_AL_SYMMETRY)
        {
          if (m <= 1)
          {
            /* Mirrors the "should never be executed" layout in
             * fpt_precompute_2. */
            int clength_1 = plength_stab;
            int clength_2 = plength_stab;
            double *a11 = step->a;
            double *a12 = a11+clength_1;
            double *a21 = a12+clength_1;
            double *a22 = a21+clength_2;
            fpt_do_step_symmetric(set->vec3, set->vec4, a11, a12,
              a21, a22, step->g, t_stab-1, set);
          }
          else if (m%2 == 0)
          {
            /* Even m: only the upper half (a11, a12) was precomputed. */
            int clength = plength_stab/2;
            double *a11 = step->a;
            double *a12 = a11+clength;
            double *a21 = NULL;
            double *a22 = NULL;
            fpt_do_step_symmetric_u(set->vec3, set->vec4, a11, a12,
              a21, a22,
              set->xcvecs[t_stab-2], step->g, t_stab-1, set);
          }
          else
          {
            /* Odd m: only the lower half (a21, a22) was precomputed. */
            int clength = plength_stab/2;
            double *a11 = NULL;
            double *a12 = NULL;
            double *a21 = step->a;
            double *a22 = a21+clength;
            fpt_do_step_symmetric_l(set->vec3, set->vec4,
              a11, a12,
              a21,
              a22, set->xcvecs[t_stab-2], step->g, t_stab-1, set);
          }
        }
        else
        {
          int clength_1 = plength_stab;
          int clength_2 = plength_stab;
          double *a11 = step->a;
          double *a12 = a11+clength_1;
          double *a21 = a12+clength_1;
          double *a22 = a21+clength_2;
          fpt_do_step(set->vec3, set->vec4, a11, a12,
            a21, a22, step->g, t_stab-1, set);
        }

        /* Stabilized contributions are accumulated directly in set->result
         * instead of set->work. */
        if (step->g != 0.0)
        {
          for (k = 0; k < plength_stab; k++)
          {
            set->result[k] += set->vec3[k];
          }
        }

        for (k = 0; k < plength_stab; k++)
        {
          set->result[Nk+k] += set->vec4[k];
        }
      }
    }
    /* Double length of polynomials. */
    plength = plength<<1;

    /*--------*/
    /*for (k = 0; k < 2*Nk; k++)
    {
      fprintf(stderr,"work[%2d] = %le + I*%le\tresult[%2d] = %le + I*%le\n",
        k,creal(set->work[k]),cimag(set->work[k]),k,creal(set->result[k]),
        cimag(set->result[k]));
    }*/
    /*--------*/
  }

  /* Add the resulting cascade coeffcients to the coeffcients accumulated from
   * the stabilization steps. */
  for (k = 0; k < 2*Nk; k++)
  {
    set->result[k] += set->work[k];
  }

  /* The last step. Compute the Chebyshev coeffcients c_k^n from the
   * polynomials in front of P_0^n and P_1^n. */
  y[0] = data->gamma_m1*(set->result[0] + data->beta_0*set->result[Nk] +
    data->alpha_0*set->result[Nk+1]*0.5);
  y[1] = data->gamma_m1*(set->result[1] + data->beta_0*set->result[Nk+1]+
    data->alpha_0*(set->result[Nk]+set->result[Nk+2]*0.5));
  y[k_end-1] = data->gamma_m1*(set->result[k_end-1] +
    data->beta_0*set->result[Nk+k_end-1] +
    data->alpha_0*set->result[Nk+k_end-2]*0.5);
  y[k_end] = data->gamma_m1*(0.5*data->alpha_0*set->result[Nk+k_end-1]);
  for (k = 2; k <= k_end-2; k++)
  {
    y[k] = data->gamma_m1*(set->result[k] + data->beta_0*set->result[Nk+k] +
      data->alpha_0*0.5*(set->result[Nk+k-1]+set->result[Nk+k+1]));
  }

  if (flags & FPT_FUNCTION_VALUES)
  {
    /* Convert the Chebyshev coefficients to function values with the
     * DCT-III planned above, then release the plan under the same critical
     * section used for planning. */
    y[0] *= 2.0;
    fftw_execute_r2r(plan,(double*)y,(double*)y);
#ifdef _OPENMP
    #pragma omp critical (nfft_omp_critical_fftw_plan)
#endif
    fftw_destroy_plan(plan);
    for (k = 0; k <= k_end; k++)
    {
      y[k] *= 0.5;
    }
  }
}
1682
/**
 * Transposed direct (slow) discrete polynomial transform for transform
 * index \e m. The adjoint of fpt_trafo_direct: maps y[0..k_end] back to
 * coefficients x (degrees k_start..k_end). Returns without doing anything
 * when the direct algorithm is disabled for this set.
 *
 * \param set   The FPT set.
 * \param m     Index of the transform.
 * \param x     Output coefficients (length k_end - k_start + 1).
 * \param y     Input array (length k_end + 1).
 * \param k_end Last degree.
 * \param flags Transform flags (FPT_FUNCTION_VALUES).
 */
void fpt_transposed_direct(fpt_set set, const int m, double _Complex *x,
  double _Complex *y, const int k_end, const unsigned int flags)
{
  int j;
  fpt_data *data = &(set->dpt[m]);
  int Nk;     /* Smallest power of two >= k_end+1 */
  int tk;     /* Its exponent: Nk = 2^tk          */
  double norm;

  X(next_power_of_2_exp_int)(k_end+1,&Nk,&tk);
  norm = 2.0/(Nk<<1);

  if (set->flags & FPT_NO_DIRECT_ALGORITHM)
  {
    return;
  }

  if (flags & FPT_FUNCTION_VALUES)
  {
    /* Input are function values at Chebyshev nodes; apply the transposed
     * Clenshaw summation directly. */
    for (j = 0; j <= k_end; j++)
    {
      set->xc_slow[j] = cos((KPI*(j+0.5))/(k_end+1));
    }

    eval_sum_clenshaw_transposed(k_end, k_end, set->result, set->xc_slow,
      y, set->work, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
      data->gamma_m1);

    memcpy(x,&set->result[data->k_start],(k_end-data->k_start+1)*
      sizeof(double _Complex));
  }
  else
  {
    /* Input are Chebyshev coefficients: normalize, apply DCT-III, then the
     * transposed Clenshaw summation. */
    memcpy(set->result,y,(k_end+1)*sizeof(double _Complex));
    memset(&set->result[k_end+1],0U,(Nk-k_end-1)*sizeof(double _Complex));

    for (j = 0; j < Nk; j++)
    {
      set->result[j] *= norm;
    }

    fftw_execute_r2r(set->plans_dct3[tk-2],(double*)set->result,
      (double*)set->result);

    /* For large transforms use the long-double variant for better
     * accuracy; the cutoff 1024 is a library-chosen heuristic. */
    if (set->N > 1024)
      eval_sum_clenshaw_transposed_ld(k_end, Nk-1, set->temp, set->xcvecs[tk-2],
        set->result, set->work, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
        data->gamma_m1);
    else
      eval_sum_clenshaw_transposed(k_end, Nk-1, set->temp, set->xcvecs[tk-2],
        set->result, set->work, &data->_alpha[1], &data->_beta[1], &data->_gamma[1],
        data->gamma_m1);

    memcpy(x,&set->temp[data->k_start],(k_end-data->k_start+1)*sizeof(double _Complex));
  }
}
1739
/**
 * Transposed fast polynomial transform (adjoint cascade summation) for
 * transform index \e m: maps y[0..k_end] back to coefficients x (degrees
 * k_start..k_end), running the cascade of fpt_trafo in reverse (tau from
 * tk-1 down to 1). Falls back to fpt_transposed_direct for small bandwidth.
 *
 * \param set   The FPT set.
 * \param m     Index of the transform.
 * \param x     Output coefficients (length k_end - k_start + 1).
 * \param y     Input array (length k_end + 1).
 * \param k_end Last degree.
 * \param flags Transform flags (FPT_FUNCTION_VALUES).
 */
void fpt_transposed(fpt_set set, const int m, double _Complex *x,
  double _Complex *y, const int k_end, const unsigned int flags)
{
  /* Get transformation data. */
  fpt_data *data = &(set->dpt[m]);
  int Nk;            /* Smallest power of two >= k_end       */
  int tk;            /* Its exponent: Nk = 2^tk              */
  int k_start_tilde;
  int k_end_tilde;

  int tau;           /* Cascade level index                  */
  int firstl;        /* First block index at current level   */
  int lastl;         /* Last block index at current level    */
  int l;             /* Block index                          */
  int plength;       /* Polynomial length at current level   */
  int plength_stab;  /* Polynomial length of stabilized step */
  fpt_step *step;    /* Current cascade step                 */
  fftw_plan plan;
  int length = k_end+1;
  fftw_r2r_kind kinds[2] = {FFTW_REDFT10,FFTW_REDFT10}; /* DCT-II */
  int k;
  int t_stab;        /* Exponent of stabilized step length   */

  /* Check, if slow transformation should be used due to small bandwidth. */
  if (k_end < FPT_BREAK_EVEN)
  {
    /* Use NDSFT. */
    fpt_transposed_direct(set, m, x, y, k_end, flags);
    return;
  }

  X(next_power_of_2_exp_int)(k_end,&Nk,&tk);
  k_start_tilde = K_START_TILDE(data->k_start,Nk);
  k_end_tilde = K_END_TILDE(k_end,Nk);

  /* Check if fast transform is activated. */
  if (set->flags & FPT_NO_FAST_ALGORITHM)
  {
    return;
  }

  if (flags & FPT_FUNCTION_VALUES)
  {
    /* Transposed of the final DCT-III in fpt_trafo: a DCT-II applied to the
     * input values. FFTW planning/destruction guarded by the library-wide
     * critical section under OpenMP. */
#ifdef _OPENMP
    int nthreads = X(get_num_threads)();
#pragma omp critical (nfft_omp_critical_fftw_plan)
{
    fftw_plan_with_nthreads(nthreads);
#endif
    plan = fftw_plan_many_r2r(1, &length, 2, (double*)set->work, NULL, 2, 1,
      (double*)set->work, NULL, 2, 1, kinds, 0U);
#ifdef _OPENMP
}
#endif
    fftw_execute_r2r(plan,(double*)y,(double*)set->result);
#ifdef _OPENMP
    #pragma omp critical (nfft_omp_critical_fftw_plan)
#endif
    fftw_destroy_plan(plan);
    for (k = 0; k <= k_end; k++)
    {
      set->result[k] *= 0.5;
    }
  }
  else
  {
    memcpy(set->result,y,(k_end+1)*sizeof(double _Complex));
  }

  /* Initialize working arrays. */
  memset(set->work,0U,2*Nk*sizeof(double _Complex));

  /* The last step is now the first step. */
  for (k = 0; k <= k_end; k++)
  {
    set->work[k] = data->gamma_m1*set->result[k];
  }
  //memset(&set->work[k_end+1],0U,(Nk+1-k_end)*sizeof(double _Complex));

  /* Transposed of the final combination step of fpt_trafo. */
  set->work[Nk] = data->gamma_m1*(data->beta_0*set->result[0] +
    data->alpha_0*set->result[1]);
  for (k = 1; k < k_end; k++)
  {
    set->work[Nk+k] = data->gamma_m1*(data->beta_0*set->result[k] +
      data->alpha_0*0.5*(set->result[k-1]+set->result[k+1]));
  }
  if (k_end<Nk)
  {
    /* NOTE(review): this also zeroes work[k_end], which was written in the
     * loop above — looks like it may have been meant to start at k_end+1;
     * confirm against the transform's adjoint derivation before changing. */
    memset(&set->work[k_end],0U,(Nk-k_end)*sizeof(double _Complex));
  }

  memcpy(set->result,set->work,2*Nk*sizeof(double _Complex));

  /* Compute the remaining steps. */
  plength = Nk;
  for (tau = tk-1; tau >= 1; tau--)
  {
    /* Compute first l. */
    firstl = FIRST_L(k_start_tilde,plength);
    /* Compute last l. */
    lastl = LAST_L(k_end_tilde,plength);

    /* Compute the multiplication steps. */
    for (l = firstl; l <= lastl; l++)
    {
      /* Initialize second half of coefficient arrays with zeros. */
      memcpy(set->vec3,&(set->work[(plength/2)*(4*l+0)]),plength*sizeof(double _Complex));
      memcpy(set->vec4,&(set->work[(plength/2)*(4*l+2)]),plength*sizeof(double _Complex));

      memcpy(&set->work[(plength/2)*(4*l+1)],&(set->work[(plength/2)*(4*l+2)]),
        (plength/2)*sizeof(double _Complex));

      /* Get matrix U_{n,tau,l} */
      step = &(data->steps[tau][l]);

      /* Check if step is stable. */
      if (step->stable)
      {
        if ((set->flags & FPT_AL_SYMMETRY) && IS_SYMMETRIC(l,m,plength))
        {
          /* Multiply third and fourth polynomial with matrix U. */
          int clength = 1<<(tau);
          double *a11 = step->a;
          double *a12 = a11+clength;
          double *a21 = a12+clength;
          double *a22 = a21+clength;
          fpt_do_step_t_symmetric(set->vec3, set->vec4, a11, a12,
            a21, a22, step->g, tau, set);
        }
        else
        {
          /* Multiply third and fourth polynomial with matrix U. */
          int clength = 1<<(tau+1);
          double *a11 = step->a;
          double *a12 = a11+clength;
          double *a21 = a12+clength;
          double *a22 = a21+clength;
          fpt_do_step_t(set->vec3, set->vec4, a11, a12,
            a21, a22, step->g, tau, set);
        }
        memcpy(&(set->vec3[plength/2]), set->vec4,(plength/2)*sizeof(double _Complex));

        for (k = 0; k < plength; k++)
        {
          set->work[plength*(4*l+2)/2+k] = set->vec3[k];
        }
      }
      else
      {
        /* Stabilize. */
        plength_stab = step->Ns;
        t_stab = step->ts;

        /* Transposed stabilization reads from set->result (where fpt_trafo
         * accumulated stabilized contributions). */
        memcpy(set->vec3,set->result,plength_stab*sizeof(double _Complex));
        memcpy(set->vec4,&(set->result[Nk]),plength_stab*sizeof(double _Complex));

        /* Multiply third and fourth polynomial with matrix U. */
        if (set->flags & FPT_AL_SYMMETRY)
        {
          if (m <= 1)
          {
            /* Mirrors the "should never be executed" layout in
             * fpt_precompute_2. */
            int clength_1 = plength_stab;
            int clength_2 = plength_stab;
            double *a11 = step->a;
            double *a12 = a11+clength_1;
            double *a21 = a12+clength_1;
            double *a22 = a21+clength_2;
            fpt_do_step_t_symmetric(set->vec3, set->vec4, a11, a12,
              a21, a22, step->g, t_stab-1, set);
          }
          else if (m%2 == 0)
          {
            /* Even m: only the upper half (a11, a12) was precomputed. */
            int clength = plength_stab/2;
            double *a11 = step->a;
            double *a12 = a11+clength;
            fpt_do_step_t_symmetric_u(set->vec3, set->vec4, a11, a12,
              set->xcvecs[t_stab-2], step->g, t_stab-1, set);
          }
          else
          {
            /* Odd m: only the lower half (a21, a22) was precomputed. */
            int clength = plength_stab/2;
            double *a21 = step->a;
            double *a22 = a21+clength;
            fpt_do_step_t_symmetric_l(set->vec3, set->vec4,
              a21, a22, set->xcvecs[t_stab-2], step->g, t_stab-1, set);
          }
        }
        else
        {
          int clength_1 = plength_stab;
          int clength_2 = plength_stab;
          double *a11 = step->a;
          double *a12 = a11+clength_1;
          double *a21 = a12+clength_1;
          double *a22 = a21+clength_2;
          fpt_do_step_t(set->vec3, set->vec4, a11, a12,
            a21, a22, step->g, t_stab-1, set);
        }

        memcpy(&(set->vec3[plength/2]),set->vec4,(plength/2)*sizeof(double _Complex));

        for (k = 0; k < plength; k++)
        {
          set->work[(plength/2)*(4*l+2)+k] = set->vec3[k];
        }
      }
    }
    /* Half the length of polynomial arrays. */
    plength = plength>>1;
  }

  /* First step: extract the output coefficients (even positions of the
   * interleaved work array). */
  for (k = 0; k <= k_end_tilde-data->k_start; k++)
  {
    x[k] = set->work[2*(data->k_start+k)];
  }
  /* Transposed of the three-term recurrence mapping of x_{Nk} in
   * fpt_trafo. */
  if (k_end == Nk)
  {
    x[Nk-data->k_start] =
      data->gammaN[tk-2]*set->work[2*(Nk-2)]
      + data->betaN[tk-2] *set->work[2*(Nk-1)]
      + data->alphaN[tk-2]*set->work[2*(Nk-1)+1];
  }
}
1978
/**
 * Releases all memory held by an FPT set: per-transform cascade step data,
 * copied recurrence coefficients, Chebyshev node arrays, FFTW plans,
 * auxiliary work buffers, and finally the set structure itself.
 *
 * \param set The FPT set to destroy; the pointer is invalid afterwards.
 */
void fpt_finalize(fpt_set set)
{
  int tau;
  int l;
  int m;
  int k_start_tilde;
  int N_tilde;
  int firstl, lastl;
  int plength;
  const int M = set->M;

  /* TODO Clean up DPT transform data structures. */
  if (!(set->flags & FPT_NO_INIT_FPT_DATA))
  {
    for (m = 0; m < M; m++)
    {
      /* Check if precomputed. */
      fpt_data *data = &set->dpt[m];
      if (data->steps != (fpt_step**)NULL)
      {
        if (!(set->flags & FPT_NO_FAST_ALGORITHM))
        {
          /* Only alphaN is freed while betaN/gammaN are merely NULLed —
           * presumably all three point into one allocation made in
           * fpt_precompute_1; verify there before changing. */
          nfft_free(data->alphaN);
          data->alphaN = NULL;
          data->betaN = NULL;
          data->gammaN = NULL;
        }

        /* Free precomputed data. */
        k_start_tilde = K_START_TILDE(data->k_start,X(next_power_of_2)(data->k_start)
          /*set->N*/);
        N_tilde = N_TILDE(set->N);
        plength = 4;
        /* Walk the cascade exactly as fpt_precompute_2 built it, so every
         * allocated step matrix is visited. */
        for (tau = 1; tau < set->t; tau++)
        {
          /* Compute first l. */
          firstl = FIRST_L(k_start_tilde,plength);
          /* Compute last l. */
          lastl = LAST_L(N_tilde,plength);

          /* For l = 0,...2^{t-tau-1}-1 compute the matrices U_{n,tau,l}. */
          for (l = firstl; l <= lastl; l++)
          {
            /* Free components. */
            if (data->steps[tau][l].a != NULL)
            {
              nfft_free(data->steps[tau][l].a);
              data->steps[tau][l].a = NULL;
            }
          }
          /* Free pointers for current level. */
          nfft_free(data->steps[tau]);
          data->steps[tau] = NULL;
          /* Double length of polynomials. */
          plength = plength<<1;
        }
        /* Free steps. */
        nfft_free(data->steps);
        data->steps = NULL;
      }

      if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
      {
        /* Check, if recurrence coefficients must be copied. */
        if (!(set->flags & FPT_PERSISTENT_DATA))
        {
          /* Only _alpha is freed while _beta/_gamma are merely NULLed —
           * presumably they share one allocation made in fpt_precompute_1;
           * verify there before changing. */
          if (data->_alpha != NULL)
            nfft_free(data->_alpha);
        }
        data->_alpha = NULL;
        data->_beta = NULL;
        data->_gamma = NULL;
      }
    }

    /* Delete array of DPT transform data. */
    nfft_free(set->dpt);
    set->dpt = NULL;
  }

  /* Free the Chebyshev node arrays, one per cascade level. */
  for (tau = 1; tau < set->t+1; tau++)
  {
    nfft_free(set->xcvecs[tau-1]);
    set->xcvecs[tau-1] = NULL;
  }
  nfft_free(set->xcvecs);
  set->xcvecs = NULL;

  /* Free auxiliary arrays. */
  nfft_free(set->work);
  nfft_free(set->result);
  set->work = NULL;
  set->result = NULL;

  /* Free FFTW plans (plan destruction guarded by the library-wide critical
   * section under OpenMP). */
  for(tau = 0; tau < set->t/*-1*/; tau++)
  {
#ifdef _OPENMP
#pragma omp critical (nfft_omp_critical_fftw_plan)
#endif
{
    fftw_destroy_plan(set->plans_dct3[tau]);
    fftw_destroy_plan(set->plans_dct2[tau]);
}
    set->plans_dct3[tau] = NULL;
    set->plans_dct2[tau] = NULL;
  }

  nfft_free(set->plans_dct3);
  nfft_free(set->plans_dct2);
  set->plans_dct3 = NULL;
  set->plans_dct2 = NULL;

  /* Check if fast transform is activated. */
  if (!(set->flags & FPT_NO_FAST_ALGORITHM))
  {
    /* Free auxiliary arrays. */
    nfft_free(set->vec3);
    nfft_free(set->vec4);
    nfft_free(set->z);
    set->vec3 = NULL;
    set->vec4 = NULL;
    set->z = NULL;
  }

  if (!(set->flags & FPT_NO_DIRECT_ALGORITHM))
  {
    /* Delete arrays of Chebyshev nodes. */
    nfft_free(set->xc_slow);
    set->xc_slow = NULL;
    nfft_free(set->temp);
    set->temp = NULL;
  }

  /* Free DPT set structure. */
  nfft_free(set);
}
static void eval_sum_clenshaw_transposed(int N, int M, double _Complex *a, double *x, double _Complex *y, double _Complex *temp, double *alpha, double *beta, double *gam, double lambda)
Clenshaw algorithm.
Definition fpt.c:717
#define K_START_TILDE(x, y)
If defined, perform critical computations in three-term recurrence always in long double instead of u...
Definition fpt.c:48
#define K_END_TILDE(x, y)
Maximum degree at top of a cascade.
Definition fpt.c:51
#define FIRST_L(x, y)
Index of first block of four functions at level.
Definition fpt.c:54
#define LAST_L(x, y)
Index of last block of four functions at level.
Definition fpt.c:57
#define FPT_PERSISTENT_DATA
If set, TODO complete comment.
Definition nfft3.h:646
#define FPT_NO_FAST_ALGORITHM
If set, TODO complete comment.
Definition nfft3.h:644
#define FPT_NO_STABILIZATION
If set, no stabilization will be used.
Definition nfft3.h:643
#define FPT_AL_SYMMETRY
If set, TODO complete comment.
Definition nfft3.h:651
#define FPT_FUNCTION_VALUES
If set, the output are function values at Chebyshev nodes rather than Chebyshev coefficients.
Definition nfft3.h:650
#define FPT_NO_DIRECT_ALGORITHM
If set, TODO complete comment.
Definition nfft3.h:645
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.
Definition infft.h:1319
Internal header file for auxiliary definitions and functions.
Header file for the nfft3 library.
void * nfft_malloc(size_t n)
void nfft_free(void *p)
Holds data for a single cascade summation.
Definition fpt.h:46
double * _alpha
< TODO Add comment here.
Definition fpt.h:56
double * _gamma
TODO Add comment here.
Definition fpt.h:58
double * gammaN
TODO Add comment here.
Definition fpt.h:51
fpt_step ** steps
The cascade summation steps
Definition fpt.h:47
double beta_0
TODO Add comment here.
Definition fpt.h:53
double * alphaN
TODO Add comment here.
Definition fpt.h:49
double * betaN
TODO Add comment here.
Definition fpt.h:50
double gamma_m1
TODO Add comment here.
Definition fpt.h:54
double alpha_0
TODO Add comment here.
Definition fpt.h:52
double * _beta
TODO Add comment here.
Definition fpt.h:57
int k_start
TODO Add comment here.
Definition fpt.h:48
Holds data for a set of cascade summations.
Definition fpt.h:66
int t
The exponent of N
Definition fpt.h:71
double ** xcvecs
Array of pointers to arrays containing the Chebyshev nodes
Definition fpt.h:73
int M
The number of DPT transforms
Definition fpt.h:68
unsigned int flags
The flags
Definition fpt.h:67
fftw_r2r_kind * kinds
Transform kinds for fftw library
Definition fpt.h:87
fftw_plan * plans_dct2
Transform plans for the fftw library
Definition fpt.h:85
fpt_data * dpt
The DPT transform data
Definition fpt.h:72
int N
The transform length.
Definition fpt.h:69
fftw_r2r_kind * kindsr
Transform kinds for fftw library
Definition fpt.h:89
fftw_plan * plans_dct3
Transform plans for the fftw library
Definition fpt.h:83
Holds data for a single multiplication step in the cascade summation.
Definition fpt.h:31
double * a
The matrix components
Definition fpt.h:37
bool stable
Indicates if the values contained represent a fast or a slow stabilized step.
Definition fpt.h:32
int Ns
TODO Add comment here.
Definition fpt.h:35
int ts
TODO Add comment here.
Definition fpt.h:36