aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/ia64/fpu/libm_reduce.S
blob: fd7d06d2634220407b84944fcdda7bd07f97a135 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
.file "libm_reduce.s"


// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.

// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History:
// 02/02/00 Initial Version
// 05/13/02 Rescheduled for speed, changed interface to pass
//          parameters in fp registers
// 02/10/03 Reordered header: .section, .global, .proc, .align;
//          used data8 for long double data storage
//
//*********************************************************************
//*********************************************************************
//
// Function:   __libm_pi_by_two_reduce(x) return r, c, and N where
//             x = N * pi/4 + (r+c) , where |r+c| <= pi/4.
//             This function is not designed to be used by the
//             general user.
//
//*********************************************************************
//
// Accuracy:       Returns double-precision values
//
//*********************************************************************
//
// Resources Used:
//
//    Floating-Point Registers:
//      f8  = Input x, return value r
//      f9  = return value c
//      f32-f70
//
//    General Purpose Registers:
//      r8  = return value N
//      r34-r64
//
//    Predicate Registers:      p6-p14
//
//*********************************************************************
//
// IEEE Special Conditions:
//
//    No conditions should be raised.
//
//*********************************************************************
//
// I. Introduction
// ===============
//
// For the forward trigonometric functions sin, cos, sincos, and
// tan, the original algorithms for IA 64 handle arguments up to
// 1 ulp less than 2^63 in magnitude. For double-extended arguments x,
// |x| >= 2^63, this routine returns N and r_hi, r_lo where
//
//    x  is accurately approximated by
//    2*K*pi  +  N * pi/2  +  r_hi + r_lo,  |r_hi+r_lo| <= pi/4.
//    CASE = 1 or 2.
//    CASE is 1 unless |r_hi + r_lo| < 2^(-33).
//
// The exact value of K is not determined, but that information is
// not required in trigonometric function computations.
//
// We first assume the argument x in question satisfies x >= 2^(63).
// In particular, it is positive. Negative x can be handled by symmetry:
//
//   -x  is accurately approximated by
//         -2*K*pi  +  (-N) * pi/2  -  (r_hi + r_lo),  |r_hi+r_lo| <= pi/4.
//
// The idea of the reduction is that
//
//       x  *  2/pi   =   N_big  +  N  +  f,      |f| <= 1/2
//
// Moreover, for double extended x, |f| >= 2^(-75). (This is an
// non-obvious fact found by enumeration using a special algorithm
// involving continued fraction.) The algorithm described below
// calculates N and an accurate approximation of f.
//
// Roughly speaking, an appropriate 256-bit (4 X 64) portion of
// 2/pi is multiplied with x to give the desired information.
//
// II. Representation of 2/PI
// ==========================
//
// The value of 2/pi in binary fixed-point is
//
//            .101000101111100110......
//
// We store 2/pi in a table, starting at the position corresponding
// to bit position 63
//
//   bit position  63 62 ... 0   -1 -2 -3 -4 -5 -6 -7  ....  -16576
//
//              0  0  ... 0  . 1  0  1  0  1  0  1  ....    X
//
//                              ^
//                               |__ implied binary pt
//
// III. Algorithm
// ==============
//
// This describes the algorithm in the most natural way using
// unsigned interger multiplication. The implementation section
// describes how the integer arithmetic is simulated.
//
// STEP 0. Initialization
// ----------------------
//
// Let the input argument x be
//
//     x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ),  63 <= m <= 16383.
//
// The first crucial step is to fetch four 64-bit portions of 2/pi.
// To fulfill this goal, we calculate the bit position L of the
// beginning of these 256-bit quantity by
//
//     L :=  62 - m.
//
// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that
// the storage of 2/pi is adequate.
//
// Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus:
//
//      bit position  L  L-1  L-2    ...  L-63
//
//      P_1    =      b   b    b     ...    b
//
// each b can be 0 or 1. Also, let P_0 be the two bits correspoding to
// bit positions L+2 and L+1. So, when each of the P_j is interpreted
// with appropriate scaling, we have
//
//      2/pi  =  P_big  + P_0 + (P_1 + P_2 + P_3 + P_4)  +  P_small
//
// Note that P_big and P_small can be ignored. The reasons are as follow.
// First, consider P_big. If P_big = 0, we can certainly ignore it.
// Otherwise, P_big >= 2^(L+3). Now,
//
//        P_big * ulp(x) >=  2^(L+3) * 2^(m-63)
//                   >=  2^(65-m  +  m-63 )
//                   >=  2^2
//
// Thus, P_big * x is an integer of the form 4*K. So
//
//       x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
//                + x*P_small*(pi/2).
//
// Hence, P_big*x corresponds to information that can be ignored for
// trigonometic function evaluation.
//
// Next, we must estimate the effect of ignoring P_small. The absolute
// error made by ignoring P_small is bounded by
//
//       |P_small * x|  <=  ulp(P_4) * x
//                  <=  2^(L-255) * 2^(m+1)
//                  <=  2^(62-m-255 + m + 1)
//                  <=  2^(-192)
//
// Since for double-extended precision, x * 2/pi = integer + f,
// 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring
// P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
//
// Further note that if x is split into x_hi + x_lo where x_lo is the
// two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then
//
//       P_0 * x_hi
//
// is also an integer of the form 4*K; and thus can also be ignored.
// Let M := P_0 * x_lo which is a small integer. The main part of the
// calculation is really the multiplication of x with the four pieces
// P_1, P_2, P_3, and P_4.
//
// Unless the reduced argument is extremely small in magnitude, it
// suffices to carry out the multiplication of x with P_1, P_2, and
// P_3. x*P_4 will be carried out and added on as a correction only
// when it is found to be needed. Note also that x*P_4 need not be
// computed exactly. A straightforward multiplication suffices since
// the rounding error thus produced would be bounded by 2^(-3*64),
// that is 2^(-192) which is small enough as the reduced argument
// is bounded from below by 2^(-75).
//
// Now that we have four 64-bit data representing 2/pi and a
// 64-bit x. We first need to calculate a highly accurate product
// of x and P_1, P_2, P_3. This is best understood as integer
// multiplication.
//
//
// STEP 1. Multiplication
// ----------------------
//
//
//                     ---------   ---------   ---------
//                    |  P_1  |   |  P_2  |   |  P_3  |
//                    ---------   ---------   ---------
//
//                                            ---------
//             X                              |   X   |
//                                            ---------
//      ----------------------------------------------------
//
//                                 ---------   ---------
//                               |  A_hi |   |  A_lo |
//                               ---------   ---------
//
//
//                    ---------   ---------
//                   |  B_hi |   |  B_lo |
//                   ---------   ---------
//
//
//        ---------   ---------
//       |  C_hi |   |  C_lo |
//       ---------   ---------
//
//      ====================================================
//       ---------   ---------   ---------   ---------
//       |  S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
//       ---------   ---------   ---------   ---------
//
//
//
// STEP 2. Get N and f
// -------------------
//
// Conceptually, after the individual pieces S_0, S_1, ..., are obtained,
// we have to sum them and obtain an integer part, N, and a fraction, f.
// Here, |f| <= 1/2, and N is an integer. Note also that N need only to
// be known to module 2^k, k >= 2. In the case when |f| is small enough,
// we would need to add in the value x*P_4.
//
//
// STEP 3. Get reduced argument
// ----------------------------
//
// The value f is not yet the reduced argument that we seek. The
// equation
//
//       x * 2/pi = 4K  + N  + f
//
// says that
//
//         x   =  2*K*pi  + N * pi/2  +  f * (pi/2).
//
// Thus, the reduced argument is given by
//
//       reduced argument =  f * pi/2.
//
// This multiplication must be performed to extra precision.
//
// IV. Implementation
// ==================
//
// Step 0. Initialization
// ----------------------
//
// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
//
// In memory, 2/pi is stored contiguously as
//
//  0x00000000 0x00000000 0xA2F....
//                       ^
//                       |__ implied binary bit
//
// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus
// -1 <= L <= -16321. We fetch from memory 5 integer pieces of data.
//
// P_0 is the two bits corresponding to bit positions L+2 and L+1
// P_1 is the 64-bit starting at bit position  L
// P_2 is the 64-bit starting at bit position  L-64
// P_3 is the 64-bit starting at bit position  L-128
// P_4 is the 64-bit starting at bit position  L-192
//
// For example, if m = 63, P_0 would be 0 and P_1 would look like
// 0xA2F...
//
// If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary.
// P_1 in binary would be  1 0 0 0 1 0 1 1 1 1 ....
//
// Step 1. Multiplication
// ----------------------
//
// At this point, P_1, P_2, P_3, P_4 are integers. They are
// supposed to be interpreted as
//
//  2^(L-63)     * P_1;
//  2^(L-63-64)  * P_2;
//  2^(L-63-128) * P_3;
// 2^(L-63-192) * P_4;
//
// Since each of them need to be multiplied to x, we would scale
// both x and the P_j's by some convenient factors: scale each
// of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
//
//   p_1 := fcvt.xf ( P_1 )
//   p_2 := fcvt.xf ( P_2 ) * 2^(-64)
//   p_3 := fcvt.xf ( P_3 ) * 2^(-128)
//   p_4 := fcvt.xf ( P_4 ) * 2^(-192)
//   x   := replace exponent of x by -1
//          because 2^m    * 1.xxxx...xxx  * 2^(L-63)
//          is      2^(-1) * 1.xxxx...xxx
//
// We are now faced with the task of computing the following
//
//                     ---------   ---------   ---------
//                    |  P_1  |   |  P_2  |   |  P_3  |
//                    ---------   ---------   ---------
//
//                                             ---------
//             X                              |   X   |
//                                            ---------
//       ----------------------------------------------------
//
//                                 ---------   ---------
//                                |  A_hi |   |  A_lo |
//                                ---------   ---------
//
//                     ---------   ---------
//                    |  B_hi |   |  B_lo |
//                    ---------   ---------
//
//         ---------   ---------
//        |  C_hi |   |  C_lo |
//        ---------   ---------
//
//      ====================================================
//       -----------   ---------   ---------   ---------
//       |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
//       -----------   ---------   ---------   ---------
//        ^          ^
//        |          |___ binary point
//        |
//        |___ possibly one more bit
//
// Let FPSR3 be set to round towards zero with widest precision
// and exponent range. Unless an explicit FPSR is given,
// round-to-nearest with widest precision and exponent range is
// used.
//
// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_C := 2^(-65).
//
// Tmp_C := fmpy.fpsr3( x, p_1 );
// If Tmp_C >= sigma_C then
//    C_hi := Tmp_C;
//    C_lo := x*p_1 - C_hi ...fma, exact
// Else
//    C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
//                   ...subtraction is exact, regardless
//                   ...of rounding direction
//    C_lo := x*p_1 - C_hi ...fma, exact
// End If
//
// Tmp_B := fmpy.fpsr3( x, p_2 );
// If Tmp_B >= sigma_B then
//    B_hi := Tmp_B;
//    B_lo := x*p_2 - B_hi ...fma, exact
// Else
//    B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
//                   ...subtraction is exact, regardless
//                   ...of rounding direction
//    B_lo := x*p_2 - B_hi ...fma, exact
// End If
//
// Tmp_A := fmpy.fpsr3( x, p_3 );
// If Tmp_A >= sigma_A then
//    A_hi := Tmp_A;
//    A_lo := x*p_3 - A_hi ...fma, exact
// Else
//    A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
//                   ...subtraction is exact, regardless
//                   ...of rounding direction
//    A_lo := x*p_3 - A_hi ...fma, exact
// End If
//
// ...Note that C_hi is of integer value. We need only the
// ...last few bits. Thus we can ensure C_hi is never a big
// ...integer, freeing us from overflow worry.
//
// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
// ...Tmp_C is the upper portion of C_hi
// C_hi := C_hi - Tmp_C
// ...0 <= C_hi < 2^7
//
// Step 2. Get N and f
// -------------------
//
// At this point, we have all the components to obtain
// S_0, S_1, S_2, S_3 and thus N and f. We start by adding
// C_lo and B_hi. This sum together with C_hi gives a good
// estimation of N and f.
//
// A := fadd.fpsr3( B_hi, C_lo )
// B := max( B_hi, C_lo )
// b := min( B_hi, C_lo )
//
// a := (B - A) + b      ...exact. Note that a is either 0
//                   ...or 2^(-64).
//
// N := round_to_nearest_integer_value( A );
// f := A - N;            ...exact because lsb(A) >= 2^(-64)
//                   ...and |f| <= 1/2.
//
// f := f + a            ...exact because a is 0 or 2^(-64);
//                   ...the msb of the sum is <= 1/2
//                   ...lsb >= 2^(-64).
//
// N := convert to integer format( C_hi + N );
// M := P_0 * x_lo;
// N := N + M;
//
// If sgn_x == 1 (that is original x was negative)
// N := 2^10 - N
// ...this maintains N to be non-negative, but still
// ...equivalent to the (negated N) mod 4.
// End If
//
// If |f| >= 2^(-33)
//
// ...Case 1
// CASE := 1
// g := A_hi + B_lo;
// s_hi := f + g;
// s_lo := (f - s_hi) + g;
//
// Else
//
// ...Case 2
// CASE := 2
// A := fadd.fpsr3( A_hi, B_lo )
// B := max( A_hi, B_lo )
// b := min( A_hi, B_lo )
//
// a := (B - A) + b      ...exact. Note that a is either 0
//                   ...or 2^(-128).
//
// f_hi := A + f;
// f_lo := (f - f_hi) + A;
// ...this is exact.
// ...f-f_hi is exact because either |f| >= |A|, in which
// ...case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
// ...means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
// ...If f = 2^(-64), f-f_hi involves cancellation and is
// ...exact. If f = -2^(-64), then A + f is exact. Hence
// ...f-f_hi is -A exactly, giving f_lo = 0.
//
// f_lo := f_lo + a;
//
// If |f| >= 2^(-50) then
//    s_hi := f_hi;
//    s_lo := f_lo;
// Else
//    f_lo := (f_lo + A_lo) + x*p_4
//    s_hi := f_hi + f_lo
//    s_lo := (f_hi - s_hi) + f_lo
// End If
//
// End If
//
// Step 3. Get reduced argument
// ----------------------------
//
// If sgn_x == 0 (that is original x is positive)
//
// D_hi := Pi_by_2_hi
// D_lo := Pi_by_2_lo
// ...load from table
//
// Else
//
// D_hi := neg_Pi_by_2_hi
// D_lo := neg_Pi_by_2_lo
// ...load from table
// End If
//
// r_hi :=  s_hi*D_hi
// r_lo :=  s_hi*D_hi - r_hi         ...fma
// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
//
// Return  N, r_hi, r_lo
//
FR_input_X = f8
FR_r_hi    = f8
FR_r_lo    = f9

FR_X       = f32
FR_N       = f33
FR_p_1     = f34
FR_TWOM33  = f35
FR_TWOM50  = f36
FR_g       = f37
FR_p_2     = f38
FR_f       = f39
FR_s_lo    = f40
FR_p_3     = f41
FR_f_abs   = f42
FR_D_lo    = f43
FR_p_4     = f44
FR_D_hi    = f45
FR_Tmp2_C  = f46
FR_s_hi    = f47
FR_sigma_A = f48
FR_A       = f49
FR_sigma_B = f50
FR_B       = f51
FR_sigma_C = f52
FR_b       = f53
FR_ScaleP2 = f54
FR_ScaleP3 = f55
FR_ScaleP4 = f56
FR_Tmp_A   = f57
FR_Tmp_B   = f58
FR_Tmp_C   = f59
FR_A_hi    = f60
FR_f_hi    = f61
FR_RSHF    = f62
FR_A_lo    = f63
FR_B_hi    = f64
FR_a       = f65
FR_B_lo    = f66
FR_f_lo    = f67
FR_N_fix   = f68
FR_C_hi    = f69
FR_C_lo    = f70

GR_N       = r8
GR_Exp_x   = r36
GR_Temp    = r37
GR_BIASL63 = r38
GR_CASE    = r39
GR_x_lo    = r40
GR_sgn_x   = r41
GR_M       = r42
GR_BASE    = r43
GR_LENGTH1 = r44
GR_LENGTH2 = r45
GR_ASUB    = r46
GR_P_0     = r47
GR_P_1     = r48
GR_P_2     = r49
GR_P_3     = r50
GR_P_4     = r51
GR_START   = r52
GR_SEGMENT = r53
GR_A       = r54
GR_B       = r55
GR_C       = r56
GR_D       = r57
GR_E       = r58
GR_TEMP1   = r59
GR_TEMP2   = r60
GR_TEMP3   = r61
GR_TEMP4   = r62
GR_TEMP5   = r63
GR_TEMP6   = r64
GR_rshf    = r64

RODATA
.align 64

LOCAL_OBJECT_START(Constants_Bits_of_2_by_pi)
data8 0x0000000000000000,0xA2F9836E4E441529
data8 0xFC2757D1F534DDC0,0xDB6295993C439041
data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0
data8 0x06492EEA09D1921C,0xFE1DEB1CB129A73E
data8 0xE88235F52EBB4484,0xE99C7026B45F7E41
data8 0x3991D639835339F4,0x9C845F8BBDF9283B
data8 0x1FF897FFDE05980F,0xEF2F118B5A0A6D1F
data8 0x6D367ECF27CB09B7,0x4F463F669E5FEA2D
data8 0x7527BAC7EBE5F17B,0x3D0739F78A5292EA
data8 0x6BFB5FB11F8D5D08,0x56033046FC7B6BAB
data8 0xF0CFBC209AF4361D,0xA9E391615EE61B08
data8 0x6599855F14A06840,0x8DFFD8804D732731
data8 0x06061556CA73A8C9,0x60E27BC08C6B47C4
data8 0x19C367CDDCE8092A,0x8359C4768B961CA6
data8 0xDDAF44D15719053E,0xA5FF07053F7E33E8
data8 0x32C2DE4F98327DBB,0xC33D26EF6B1E5EF8
data8 0x9F3A1F35CAF27F1D,0x87F121907C7C246A
data8 0xFA6ED5772D30433B,0x15C614B59D19C3C2
data8 0xC4AD414D2C5D000C,0x467D862D71E39AC6
data8 0x9B0062337CD2B497,0xA7B4D55537F63ED7
data8 0x1810A3FC764D2A9D,0x64ABD770F87C6357
data8 0xB07AE715175649C0,0xD9D63B3884A7CB23
data8 0x24778AD623545AB9,0x1F001B0AF1DFCE19
data8 0xFF319F6A1E666157,0x9947FBACD87F7EB7
data8 0x652289E83260BFE6,0xCDC4EF09366CD43F
data8 0x5DD7DE16DE3B5892,0x9BDE2822D2E88628
data8 0x4D58E232CAC616E3,0x08CB7DE050C017A7
data8 0x1DF35BE01834132E,0x6212830148835B8E
data8 0xF57FB0ADF2E91E43,0x4A48D36710D8DDAA
data8 0x425FAECE616AA428,0x0AB499D3F2A6067F
data8 0x775C83C2A3883C61,0x78738A5A8CAFBDD7
data8 0x6F63A62DCBBFF4EF,0x818D67C12645CA55
data8 0x36D9CAD2A8288D61,0xC277C9121426049B
data8 0x4612C459C444C5C8,0x91B24DF31700AD43
data8 0xD4E5492910D5FDFC,0xBE00CC941EEECE70
data8 0xF53E1380F1ECC3E7,0xB328F8C79405933E
data8 0x71C1B3092EF3450B,0x9C12887B20AB9FB5
data8 0x2EC292472F327B6D,0x550C90A7721FE76B
data8 0x96CB314A1679E279,0x4189DFF49794E884
data8 0xE6E29731996BED88,0x365F5F0EFDBBB49A
data8 0x486CA46742727132,0x5D8DB8159F09E5BC
data8 0x25318D3974F71C05,0x30010C0D68084B58
data8 0xEE2C90AA4702E774,0x24D6BDA67DF77248
data8 0x6EEF169FA6948EF6,0x91B45153D1F20ACF
data8 0x3398207E4BF56863,0xB25F3EDD035D407F
data8 0x8985295255C06437,0x10D86D324832754C
data8 0x5BD4714E6E5445C1,0x090B69F52AD56614
data8 0x9D072750045DDB3B,0xB4C576EA17F9877D
data8 0x6B49BA271D296996,0xACCCC65414AD6AE2
data8 0x9089D98850722CBE,0xA4049407777030F3
data8 0x27FC00A871EA49C2,0x663DE06483DD9797
data8 0x3FA3FD94438C860D,0xDE41319D39928C70
data8 0xDDE7B7173BDF082B,0x3715A0805C93805A
data8 0x921110D8E80FAF80,0x6C4BFFDB0F903876
data8 0x185915A562BBCB61,0xB989C7BD401004F2
data8 0xD2277549F6B6EBBB,0x22DBAA140A2F2689
data8 0x768364333B091A94,0x0EAA3A51C2A31DAE
data8 0xEDAF12265C4DC26D,0x9C7A2D9756C0833F
data8 0x03F6F0098C402B99,0x316D07B43915200C
data8 0x5BC3D8C492F54BAD,0xC6A5CA4ECD37A736
data8 0xA9E69492AB6842DD,0xDE6319EF8C76528B
data8 0x6837DBFCABA1AE31,0x15DFA1AE00DAFB0C
data8 0x664D64B705ED3065,0x29BF56573AFF47B9
data8 0xF96AF3BE75DF9328,0x3080ABF68C6615CB
data8 0x040622FA1DE4D9A4,0xB33D8F1B5709CD36
data8 0xE9424EA4BE13B523,0x331AAAF0A8654FA5
data8 0xC1D20F3F0BCD785B,0x76F923048B7B7217
data8 0x8953A6C6E26E6F00,0xEBEF584A9BB7DAC4
data8 0xBA66AACFCF761D02,0xD12DF1B1C1998C77
data8 0xADC3DA4886A05DF7,0xF480C62FF0AC9AEC
data8 0xDDBC5C3F6DDED01F,0xC790B6DB2A3A25A3
data8 0x9AAF009353AD0457,0xB6B42D297E804BA7
data8 0x07DA0EAA76A1597B,0x2A12162DB7DCFDE5
data8 0xFAFEDB89FDBE896C,0x76E4FCA90670803E
data8 0x156E85FF87FD073E,0x2833676186182AEA
data8 0xBD4DAFE7B36E6D8F,0x3967955BBF3148D7
data8 0x8416DF30432DC735,0x6125CE70C9B8CB30
data8 0xFD6CBFA200A4E46C,0x05A0DD5A476F21D2
data8 0x1262845CB9496170,0xE0566B0152993755
data8 0x50B7D51EC4F1335F,0x6E13E4305DA92E85
data8 0xC3B21D3632A1A4B7,0x08D4B1EA21F716E4
data8 0x698F77FF2780030C,0x2D408DA0CD4F99A5
data8 0x20D3A2B30A5D2F42,0xF9B4CBDA11D0BE7D
data8 0xC1DB9BBD17AB81A2,0xCA5C6A0817552E55
data8 0x0027F0147F8607E1,0x640B148D4196DEBE
data8 0x872AFDDAB6256B34,0x897BFEF3059EBFB9
data8 0x4F6A68A82A4A5AC4,0x4FBCF82D985AD795
data8 0xC7F48D4D0DA63A20,0x5F57A4B13F149538
data8 0x800120CC86DD71B6,0xDEC9F560BF11654D
data8 0x6B0701ACB08CD0C0,0xB24855510EFB1EC3
data8 0x72953B06A33540C0,0x7BDC06CC45E0FA29
data8 0x4EC8CAD641F3E8DE,0x647CD8649B31BED9
data8 0xC397A4D45877C5E3,0x6913DAF03C3ABA46
data8 0x18465F7555F5BDD2,0xC6926E5D2EACED44
data8 0x0E423E1C87C461E9,0xFD29F3D6E7CA7C22
data8 0x35916FC5E0088DD7,0xFFE26A6EC6FDB0C1
data8 0x0893745D7CB2AD6B,0x9D6ECD7B723E6A11
data8 0xC6A9CFF7DF7329BA,0xC9B55100B70DB2E2
data8 0x24BA74607DE58AD8,0x742C150D0C188194
data8 0x667E162901767A9F,0xBEFDFDEF4556367E
data8 0xD913D9ECB9BA8BFC,0x97C427A831C36EF1
data8 0x36C59456A8D8B5A8,0xB40ECCCF2D891234
data8 0x576F89562CE3CE99,0xB920D6AA5E6B9C2A
data8 0x3ECC5F114A0BFDFB,0xF4E16D3B8E2C86E2
data8 0x84D4E9A9B4FCD1EE,0xEFC9352E61392F44
data8 0x2138C8D91B0AFC81,0x6A4AFBD81C2F84B4
data8 0x538C994ECC2254DC,0x552AD6C6C096190B
data8 0xB8701A649569605A,0x26EE523F0F117F11
data8 0xB5F4F5CBFC2DBC34,0xEEBC34CC5DE8605E
data8 0xDD9B8E67EF3392B8,0x17C99B5861BC57E1
data8 0xC68351103ED84871,0xDDDD1C2DA118AF46
data8 0x2C21D7F359987AD9,0xC0549EFA864FFC06
data8 0x56AE79E536228922,0xAD38DC9367AAE855
data8 0x3826829BE7CAA40D,0x51B133990ED7A948
data8 0x0569F0B265A7887F,0x974C8836D1F9B392
data8 0x214A827B21CF98DC,0x9F405547DC3A74E1
data8 0x42EB67DF9DFE5FD4,0x5EA4677B7AACBAA2
data8 0xF65523882B55BA41,0x086E59862A218347
data8 0x39E6E389D49EE540,0xFB49E956FFCA0F1C
data8 0x8A59C52BFA94C5C1,0xD3CFC50FAE5ADB86
data8 0xC5476243853B8621,0x94792C8761107B4C
data8 0x2A1A2C8012BF4390,0x2688893C78E4C4A8
data8 0x7BDBE5C23AC4EAF4,0x268A67F7BF920D2B
data8 0xA365B1933D0B7CBD,0xDC51A463DD27DDE1
data8 0x6919949A9529A828,0xCE68B4ED09209F44
data8 0xCA984E638270237C,0x7E32B90F8EF5A7E7
data8 0x561408F1212A9DB5,0x4D7E6F5119A5ABF9
data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283
data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED
data8 0x34007700D255F4FC,0x4D59018071E0E13F
data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB
LOCAL_OBJECT_END(Constants_Bits_of_2_by_pi)

LOCAL_OBJECT_START(Constants_Bits_of_pi_by_2)
data8 0xC90FDAA22168C234,0x00003FFF
data8 0xC4C6628B80DC1CD1,0x00003FBF
LOCAL_OBJECT_END(Constants_Bits_of_pi_by_2)

.section .text
.global __libm_pi_by_2_reduce#
.proc __libm_pi_by_2_reduce#
.align 32

__libm_pi_by_2_reduce:

//    X is in f8
//    Place the two-piece result r (r_hi) in f8 and c (r_lo) in f9
//    N is returned in r8

{ .mfi
      alloc  r34 = ar.pfs,2,34,0,0
      fsetc.s3 0x00,0x7F     // Set sf3 to round to zero, 82-bit prec, td, ftz
      nop.i 999
}
{ .mfi
      addl           GR_BASE   = @ltoff(Constants_Bits_of_2_by_pi#), gp
      nop.f 999
      mov GR_BIASL63 = 0x1003E
}
;;


//    L         -1-2-3-4
//    0 0 0 0 0. 1 0 1 0
//    M          0 1 2 .... 63, 64 65 ... 127, 128
//     ---------------------------------------------
//    Segment 0.        1     ,      2       ,    3
//    START = M - 63                        M = 128 becomes 65
//    LENGTH1  = START & 0x3F               65 become position 1
//    SEGMENT  = shr(START,6) + 1      0 maps to 1,   64 maps to 2,
//    LENGTH2  = 64 - LENGTH1
//    Address_BASE = shladd(SEGMENT,3) + BASE


{ .mmi
      getf.exp GR_Exp_x = FR_input_X
      ld8 GR_BASE = [GR_BASE]
      mov GR_TEMP5 = 0x0FFFE
}
;;

//    Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
{ .mmi
      getf.sig GR_x_lo = FR_input_X
      mov GR_TEMP6 = 0x0FFBE
      nop.i 999
}
;;

//    Special Code for testing DE arguments
//          movl GR_BIASL63 = 0x0000000000013FFE
//          movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
//          setf.exp FR_X = GR_BIASL63
//          setf.sig FR_ScaleP3 = GR_x_lo
//          fmerge.se FR_X = FR_X,FR_ScaleP3
//    Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
//    2/pi is stored contiguously as
//    0x00000000 0x00000000.0xA2F....
//    M = EXP - BIAS  ( M >= 63)
//    Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m.
//    Thus -1 <= L <= -16321.
{ .mmi
      setf.exp FR_sigma_B = GR_TEMP5
      setf.exp FR_sigma_A = GR_TEMP6
      extr.u GR_M = GR_Exp_x,0,17
}
;;

{ .mii
      and  GR_x_lo = 0x03,GR_x_lo
      sub  GR_START = GR_M,GR_BIASL63
      add  GR_BASE = 8,GR_BASE           // To effectively add 1 to SEGMENT
}
;;

{ .mii
      and  GR_LENGTH1 = 0x3F,GR_START
      shr.u  GR_SEGMENT = GR_START,6
      nop.i 999
}
;;

{ .mmi
      shladd GR_BASE = GR_SEGMENT,3,GR_BASE
      sub  GR_LENGTH2 = 0x40,GR_LENGTH1
      cmp.le p6,p7 = 0x2,GR_LENGTH1
}
;;

//    P_0 is the two bits corresponding to bit positions L+2 and L+1
//    P_1 is the 64-bit starting at bit position  L
//    P_2 is the 64-bit starting at bit position  L-64
//    P_3 is the 64-bit starting at bit position  L-128
//    P_4 is the 64-bit starting at bit position  L-192
//    P_1 is made up of Alo and Bhi
//    P_1 = deposit Alo, position 0, length2  into P_1,position length1
//          deposit Bhi, position length2, length1 into P_1, position 0
//    P_2 is made up of Blo and Chi
//    P_2 = deposit Blo, position 0, length2  into P_2, position length1
//          deposit Chi, position length2, length1 into P_2, position 0
//    P_3 is made up of Clo and Dhi
//    P_3 = deposit Clo, position 0, length2  into P_3, position length1
//          deposit Dhi, position length2, length1 into P_3, position 0
//    P_4 is made up of Clo and Dhi
//    P_4 = deposit Dlo, position 0, length2  into P_4, position length1
//          deposit Ehi, position length2, length1 into P_4, position 0
{ .mfi
      ld8 GR_A = [GR_BASE],8
      fabs FR_X = FR_input_X
(p7)  cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1
}
;;

//    ld_64 A at Base and increment Base by 8
//    ld_64 B at Base and increment Base by 8
//    ld_64 C at Base and increment Base by 8
//    ld_64 D at Base and increment Base by 8
//    ld_64 E at Base and increment Base by 8
//                                          A/B/C/D
//                                    ---------------------
//    A, B, C, D, and E look like    | length1 | length2   |
//                                    ---------------------
//                                       hi        lo
{ .mlx
      ld8 GR_B = [GR_BASE],8
      movl GR_rshf = 0x43e8000000000000   // 1.10000 2^63 for right shift N_fix
}
;;

{ .mmi
      ld8 GR_C = [GR_BASE],8
      nop.m 999
(p8)  extr.u GR_Temp = GR_A,63,1
}
;;

//    If length1 >= 2,
//       P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0.
{ .mii
      ld8 GR_D = [GR_BASE],8
      shl GR_TEMP1 = GR_A,GR_LENGTH1   // MM instruction
(p6)  shr.u GR_P_0 = GR_A,GR_LENGTH2   // MM instruction
}
;;

{ .mii
      ld8 GR_E = [GR_BASE],-40
      shl GR_TEMP2 = GR_B,GR_LENGTH1   // MM instruction
      shr.u GR_P_1 = GR_B,GR_LENGTH2   // MM instruction
}
;;

//    Else
//       Load 16 bit of ASUB from (Base_Address_of_A - 2)
//       P_0 = ASUB & 0x3
//       If length1 == 0,
//          P_0 complete
//       Else
//          Deposit element 63 from Ahi and place in element 0 of P_0.
//       Endif
//    Endif

{ .mii
(p7)  ld2 GR_ASUB = [GR_BASE],8
      shl GR_TEMP3 = GR_C,GR_LENGTH1   // MM instruction
      shr.u GR_P_2 = GR_C,GR_LENGTH2   // MM instruction
}
;;

{ .mii
      setf.d FR_RSHF = GR_rshf         // Form right shift const 1.100 * 2^63
      shl GR_TEMP4 = GR_D,GR_LENGTH1   // MM instruction
      shr.u GR_P_3 = GR_D,GR_LENGTH2   // MM instruction
}
;;

{ .mmi
(p7)  and GR_P_0 = 0x03,GR_ASUB
(p6)  and GR_P_0 = 0x03,GR_P_0
      shr.u GR_P_4 = GR_E,GR_LENGTH2   // MM instruction
}
;;

{ .mmi
      nop.m 999
      or GR_P_1 = GR_P_1,GR_TEMP1
(p8)  and GR_P_0 = 0x1,GR_P_0
}
;;

{ .mmi
      setf.sig FR_p_1 = GR_P_1
      or GR_P_2 = GR_P_2,GR_TEMP2
(p8)  shladd GR_P_0 = GR_P_0,1,GR_Temp
}
;;

{ .mmf
      setf.sig FR_p_2 = GR_P_2
      or GR_P_3 = GR_P_3,GR_TEMP3
      fmerge.se FR_X = FR_sigma_B,FR_X
}
;;

{ .mmi
      setf.sig FR_p_3 = GR_P_3
      or GR_P_4 = GR_P_4,GR_TEMP4
      pmpy2.r GR_M = GR_P_0,GR_x_lo
}
;;

//    P_1, P_2, P_3, P_4 are integers. They should be
//    2^(L-63)     * P_1;
//    2^(L-63-64)  * P_2;
//    2^(L-63-128) * P_3;
//    2^(L-63-192) * P_4;
//    Since each of them need to be multiplied to x, we would scale
//    both x and the P_j's by some convenient factors: scale each
//    of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
//    p_1 := fcvt.xf ( P_1 )
//    p_2 := fcvt.xf ( P_2 ) * 2^(-64)
//    p_3 := fcvt.xf ( P_3 ) * 2^(-128)
//    p_4 := fcvt.xf ( P_4 ) * 2^(-192)
//    x= Set x's exp to -1 because 2^m*1.x...x *2^(L-63)=2^(-1)*1.x...xxx
//             ---------   ---------   ---------
//             |  P_1  |   |  P_2  |   |  P_3  |
//             ---------   ---------   ---------
//                                           ---------
//            X                              |   X   |
//                                           ---------
//      ----------------------------------------------------
//                               ---------   ---------
//                               |  A_hi |   |  A_lo |
//                               ---------   ---------
//                   ---------   ---------
//                   |  B_hi |   |  B_lo |
//                   ---------   ---------
//       ---------   ---------
//       |  C_hi |   |  C_lo |
//       ---------   ---------
//     ====================================================
//    -----------   ---------   ---------   ---------
//    |    S_0  |   |  S_1  |   |  S_2  |   |  S_3  |
//    -----------   ---------   ---------   ---------
//    |            |___ binary point
//    |___ possibly one more bit
//
//    Let FPSR3 be set to round towards zero with widest precision
//    and exponent range. Unless an explicit FPSR is given,
//    round-to-nearest with widest precision and exponent range is
//    used.
{ .mmi
      setf.sig FR_p_4 = GR_P_4
      mov GR_TEMP1 = 0x0FFBF
      nop.i 999
}
;;

{ .mmi
      setf.exp FR_ScaleP2 = GR_TEMP1
      mov GR_TEMP2 = 0x0FF7F
      nop.i 999
}
;;

{ .mmi
      setf.exp FR_ScaleP3 = GR_TEMP2
      mov GR_TEMP4 = 0x1003E
      nop.i 999
}
;;

{ .mmf
      setf.exp FR_sigma_C = GR_TEMP4
      mov GR_Temp = 0x0FFDE
      fcvt.xuf.s1 FR_p_1 = FR_p_1
}
;;

{ .mfi
      setf.exp FR_TWOM33 = GR_Temp
      fcvt.xuf.s1 FR_p_2 = FR_p_2
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fcvt.xuf.s1 FR_p_3 = FR_p_3
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fcvt.xuf.s1 FR_p_4 = FR_p_4
      nop.i 999
}
;;

//    Tmp_C := fmpy.fpsr3( x, p_1 );
//    Tmp_B := fmpy.fpsr3( x, p_2 );
//    Tmp_A := fmpy.fpsr3( x, p_3 );
//    If Tmp_C >= sigma_C then
//      C_hi := Tmp_C;
//      C_lo := x*p_1 - C_hi ...fma, exact
//    Else
//      C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
//      C_lo := x*p_1 - C_hi ...fma, exact
//    End If
//    If Tmp_B >= sigma_B then
//      B_hi := Tmp_B;
//      B_lo := x*p_2 - B_hi ...fma, exact
//    Else
//      B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
//      B_lo := x*p_2 - B_hi ...fma, exact
//    End If
//    If Tmp_A >= sigma_A then
//      A_hi := Tmp_A;
//      A_lo := x*p_3 - A_hi ...fma, exact
//    Else
//      A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
//      Exact, regardless ...of rounding direction
//      A_lo := x*p_3 - A_hi ...fma, exact
//    Endif
{ .mfi
      nop.m 999
      fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
      nop.i 999
}
;;

{ .mfi
      mov GR_TEMP3 = 0x0FF3F
      fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
      nop.i 999
}
;;

{ .mmf
      setf.exp FR_ScaleP4 = GR_TEMP3
      mov GR_TEMP4 = 0x10045
      fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3
}
;;

{ .mfi
      nop.m 999
      fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C   // For Tmp_C < sigma_C case
      nop.i 999
}
;;

{ .mmf
      setf.exp FR_Tmp2_C = GR_TEMP4
      nop.m 999
      fmpy.s3 FR_Tmp_B = FR_X,FR_p_2
}
;;

{ .mfi
      addl           GR_BASE   = @ltoff(Constants_Bits_of_pi_by_2#), gp
      fcmp.ge.s1 p12,  p9 = FR_Tmp_C,FR_sigma_C
      nop.i 999
}
{ .mfi
      nop.m 999
      fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
      nop.i 99
}
;;

{ .mfi
      ld8 GR_BASE = [GR_BASE]
(p12) mov FR_C_hi = FR_Tmp_C
      nop.i 999
}
{ .mfi
      nop.m 999
(p9)  fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
      nop.i 999
}
;;



//   End If
//   Step 3. Get reduced argument
//   If sgn_x == 0 (that is original x is positive)
//      D_hi := Pi_by_2_hi
//      D_lo := Pi_by_2_lo
//      Load from table
//   Else
//      D_hi := neg_Pi_by_2_hi
//      D_lo := neg_Pi_by_2_lo
//      Load from table
//   End If

{ .mfi
      nop.m 999
      fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
      nop.i 999
}
{ .mfi
      nop.m 999
      fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B     // For Tmp_B < sigma_B case
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A     // For Tmp_A < sigma_A case
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fcmp.ge.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
      nop.i 999
}
{ .mfi
      nop.m 999
      fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
      nop.i 999
}
;;

{ .mfi
      ldfe FR_D_hi = [GR_BASE],16
      fcmp.ge.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
      nop.i 999
}
;;

{ .mfi
      ldfe FR_D_lo = [GR_BASE]
(p13) mov FR_B_hi = FR_Tmp_B
      nop.i 999
}
{ .mfi
      nop.m 999
(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p14) mov FR_A_hi = FR_Tmp_A
      nop.i 999
}
{ .mfi
      nop.m 999
(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
      nop.i 999
}
;;

//    Note that C_hi is of integer value. We need only the
//    last few bits. Thus we can ensure C_hi is never a big
//    integer, freeing us from overflow worry.
//    Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
//    Tmp_C is the upper portion of C_hi
{ .mfi
      nop.m 999
      fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
      tbit.z p12,p9 = GR_Exp_x, 17
}
;;

{ .mfi
      nop.m 999
      fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
      nop.i 999
}
{ .mfi
      nop.m 999
      fadd.s3 FR_A = FR_B_hi,FR_C_lo
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
      nop.i 999
}
;;

//    *******************
//    Step 2. Get N and f
//    *******************
//    We have all the components to obtain
//    S_0, S_1, S_2, S_3 and thus N and f. We start by adding
//    C_lo and B_hi. This sum together with C_hi estimates
//    N and f well.
//    A := fadd.fpsr3( B_hi, C_lo )
//    B := max( B_hi, C_lo )
//    b := min( B_hi, C_lo )
{ .mfi
      nop.m 999
      fmax.s1 FR_B = FR_B_hi,FR_C_lo
      nop.i 999
}
;;

// We use a right-shift trick to get the integer part of A into the rightmost
// bits of the significand by adding 1.1000..00 * 2^63.  This operation is good
// if |A| < 2^61, which it is in this case.  We are doing this to save a few
// cycles over using fcvt.fx followed by fnorm.  The second step of the trick
// is to subtract the same constant to float the rounded integer into a fp reg.

{ .mfi
      nop.m 999
//    N := round_to_nearest_integer_value( A );
      fma.s1 FR_N_fix = FR_A, f1, FR_RSHF
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fmin.s1 FR_b = FR_B_hi,FR_C_lo
      nop.i 999
}
{ .mfi
      nop.m 999
//    C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
      fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
      nop.i 999
}
;;

{ .mfi
      nop.m 999
//    a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
      fsub.s1 FR_a = FR_B,FR_A
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fms.s1 FR_N = FR_N_fix, f1, FR_RSHF
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fadd.s1 FR_a = FR_a,FR_b
      nop.i 999
}
;;

//    f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
//    N := convert to integer format( C_hi + N );
//    M := P_0 * x_lo;
//    N := N + M;
{ .mfi
      nop.m 999
      fsub.s1 FR_f = FR_A,FR_N
      nop.i 999
}
{ .mfi
      nop.m 999
      fadd.s1 FR_N = FR_N,FR_C_hi
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p9)  fsub.s1 FR_D_hi = f0, FR_D_hi
      nop.i 999
}
{ .mfi
      nop.m 999
(p9)  fsub.s1 FR_D_lo = f0, FR_D_lo
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fadd.s1 FR_g = FR_A_hi,FR_B_lo          // For Case 1, g=A_hi+B_lo
      nop.i 999
}
{ .mfi
      nop.m 999
      fadd.s3 FR_A = FR_A_hi,FR_B_lo          // For Case 2, A=A_hi+B_lo w/ sf3
      nop.i 999
}
;;

{ .mfi
      mov GR_Temp = 0x0FFCD                   // For Case 2, exponent of 2^-50
      fmax.s1 FR_B = FR_A_hi,FR_B_lo          // For Case 2, B=max(A_hi,B_lo)
      nop.i 999
}
;;

//    f = f + a      Exact because a is 0 or 2^(-64);
//    the msb of the sum is <= 1/2 and lsb >= 2^(-64).
{ .mfi
      setf.exp FR_TWOM50 = GR_Temp            // For Case 2, form 2^-50
      fcvt.fx.s1 FR_N = FR_N
      nop.i 999
}
{ .mfi
      nop.m 999
      fadd.s1 FR_f = FR_f,FR_a
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fmin.s1 FR_b = FR_A_hi,FR_B_lo          // For Case 2, b=min(A_hi,B_lo)
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fsub.s1 FR_a = FR_B,FR_A                // For Case 2, a=B-A
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fadd.s1 FR_s_hi = FR_f,FR_g             // For Case 1, s_hi=f+g
      nop.i 999
}
{ .mfi
      nop.m 999
      fadd.s1 FR_f_hi = FR_A,FR_f             // For Case 2, f_hi=A+f
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fabs FR_f_abs = FR_f
      nop.i 999
}
;;

{ .mfi
      getf.sig GR_N = FR_N
      fsetc.s3 0x7F,0x40                 // Reset sf3 to user settings + td
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fsub.s1 FR_s_lo = FR_f,FR_s_hi          // For Case 1, s_lo=f-s_hi
      nop.i 999
}
{ .mfi
      nop.m 999
      fsub.s1 FR_f_lo = FR_f,FR_f_hi          // For Case 2, f_lo=f-f_hi
      nop.i 999
}
;;

{ .mfi
      nop.m 999
      fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi       // For Case 1, r_hi=s_hi*D_hi
      nop.i 999
}
{ .mfi
      nop.m 999
      fadd.s1 FR_a = FR_a,FR_b                // For Case 2, a=a+b
      nop.i 999
}
;;


//    If sgn_x == 1 (that is original x was negative)
//       N := 2^10 - N
//       this maintains N to be non-negative, but still
//       equivalent to the (negated N) mod 4.
//    End If
{ .mfi
      add GR_N = GR_N,GR_M
      fcmp.ge.s1 p13, p10 = FR_f_abs,FR_TWOM33
      mov GR_Temp = 0x00400
}
;;

{ .mfi
(p9)  sub GR_N = GR_Temp,GR_N
      fadd.s1 FR_s_lo = FR_s_lo,FR_g           // For Case 1, s_lo=s_lo+g
      nop.i 999
}
{ .mfi
      nop.m 999
      fadd.s1 FR_f_lo = FR_f_lo,FR_A           // For Case 2, f_lo=f_lo+A
      nop.i 999
}
;;

//       a := (B - A) + b      Exact.
//       Note that a is either 0 or 2^(-128).
//       f_hi := A + f;
//       f_lo := (f - f_hi) + A
//       f_lo=f-f_hi is exact because either |f| >= |A|, in which
//       case f-f_hi is clearly exact; or otherwise, 0<|f|<|A|
//       means msb(f) <= msb(A) = 2^(-64) => |f| = 2^(-64).
//       If f = 2^(-64), f-f_hi involves cancellation and is
//       exact. If f = -2^(-64), then A + f is exact. Hence
//       f-f_hi is -A exactly, giving f_lo = 0.
//       f_lo := f_lo + a;

//    If |f| >= 2^(-33)
//       Case 1
//       CASE := 1
//       g := A_hi + B_lo;
//       s_hi := f + g;
//       s_lo := (f - s_hi) + g;
//   Else
//       Case 2
//       CASE := 2
//       A := fadd.fpsr3( A_hi, B_lo )
//       B := max( A_hi, B_lo )
//       b := min( A_hi, B_lo )

{ .mfi
      nop.m 999
(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
      nop.i 999
}
{ .mfi
      nop.m 999
(p13) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi //For Case 1, r_lo=s_hi*D_hi+r_hi
      nop.i 999
}
;;

//       If |f| >= 2^(-50) then
//          s_hi := f_hi;
//          s_lo := f_lo;
//       Else
//          f_lo := (f_lo + A_lo) + x*p_4
//          s_hi := f_hi + f_lo
//          s_lo := (f_hi - s_hi) + f_lo
//       End If
{ .mfi
      nop.m 999
(p14) mov FR_s_hi = FR_f_hi
      nop.i 999
}
{ .mfi
      nop.m 999
(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p14) mov FR_s_lo = FR_f_lo
      nop.i 999
}
{ .mfi
      nop.m 999
(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p13) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo //For Case 1, r_lo=s_hi*D_lo+r_lo
      nop.i 999
}
{ .mfi
      nop.m 999
(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
      nop.i 999
}
;;

//   r_hi :=  s_hi*D_hi
//   r_lo :=  s_hi*D_hi - r_hi  with fma
//   r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
{ .mfi
      nop.m 999
(p10) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
      nop.i 999
}
{ .mfi
      nop.m 999
(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p10) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
      nop.i 999
}
{ .mfi
      nop.m 999
(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
      nop.i 999
}
;;

{ .mfi
      nop.m 999
(p10) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
      nop.i 999
}
;;

//   Return  N, r_hi, r_lo
//   We do not return CASE
{ .mfb
      nop.m 999
      fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
      br.ret.sptk   b0
}
;;

.endp __libm_pi_by_2_reduce#