View Javadoc
1   /*
2    *  Licensed under the Apache License, Version 2.0 (the "License");
3    *  you may not use this file except in compliance with the License.
4    *  You may obtain a copy of the License at
5    *
6    *       http://www.apache.org/licenses/LICENSE-2.0
7    *
8    *  Unless required by applicable law or agreed to in writing, software
9    *  distributed under the License is distributed on an "AS IS" BASIS,
10   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   *  See the License for the specific language governing permissions and
12   *  limitations under the License.
13   *  under the License.
14   */
15  
16  package org.apache.commons.imaging.formats.jpeg.decoder;
17  
18  final class Dct {
19      /*
20       * The book "JPEG still image data compression standard", by Pennebaker and Mitchell, Chapter 4, discusses a number of approaches to the fast DCT. Here's
21       * the cost, excluding modified (de)quantization, for transforming an 8x8 block:
22       *
23       * Algorithm Adds Multiplies RightShifts Total Naive 896 1024 0 1920 "Symmetries" 448 224 0 672 Vetterli and Ligtenberg 464 208 0 672 Arai, Agui and
24       * Nakajima (AA&N) 464 80 0 544 Feig 8x8 462 54 6 522 Fused mul/add (a pipe dream) 416
25       *
26       * IJG's libjpeg, FFmpeg, and a number of others use AA&N.
27       *
28       * It would appear that Feig does 4-5% less operations, and multiplications are reduced from 80 in AA&N to only 54. But in practice:
29       *
30       * Benchmarks, Intel Core i3 @ 2.93 GHz in long mode, 4 GB RAM Time taken to do 100 million IDCTs (less is better): Rene' Stöckel's Feig, int: 45.07
31       * seconds My Feig, floating point: 36.252 seconds AA&N, unrolled loops, double[][] -> double[][]: 25.167 seconds
32       *
33       * Clearly Feig is hopeless. I suspect the performance killer is simply the weight of the algorithm: massive number of local variables, large code size, and
34       * lots of random array accesses.
35       *
36       * Also, AA&N can be optimized a lot: AA&N, rolled loops, double[][] -> double[][]: 21.162 seconds AA&N, rolled loops, float[][] -> float[][]: no
37       * improvement, but at some stage Hotspot might start doing SIMD, so let's use float AA&N, rolled loops, float[] -> float[][]: 19.979 seconds apparently 2D
38       * arrays are slow! AA&N, rolled loops, inlined 1D AA&N transform, float[] transformed in-place: 18.5 seconds AA&N, previous version rewritten in C and
39       * compiled with "gcc -O3" takes: 8.5 seconds (probably due to heavy use of SIMD)
40       *
41       * Other brave attempts: AA&N, best float version converted to 16:16 fixed point: 23.923 seconds
42       *
43       * Anyway the best float version stays. 18.5 seconds = 5.4 million transforms per second per core :-)
44       */
45  
46      private static final float[] DCT_SCALING_FACTORS = { (float) (0.5 / Math.sqrt(2.0)), (float) (0.25 / Math.cos(Math.PI / 16.0)),
47              (float) (0.25 / Math.cos(2.0 * Math.PI / 16.0)), (float) (0.25 / Math.cos(3.0 * Math.PI / 16.0)), (float) (0.25 / Math.cos(4.0 * Math.PI / 16.0)),
48              (float) (0.25 / Math.cos(5.0 * Math.PI / 16.0)), (float) (0.25 / Math.cos(6.0 * Math.PI / 16.0)),
49              (float) (0.25 / Math.cos(7.0 * Math.PI / 16.0)), };
50  
51      private static final float[] IDCT_SCALING_FACTORS = { (float) (2.0 * 4.0 / Math.sqrt(2.0) * 0.0625), (float) (4.0 * Math.cos(Math.PI / 16.0) * 0.125),
52              (float) (4.0 * Math.cos(2.0 * Math.PI / 16.0) * 0.125), (float) (4.0 * Math.cos(3.0 * Math.PI / 16.0) * 0.125),
53              (float) (4.0 * Math.cos(4.0 * Math.PI / 16.0) * 0.125), (float) (4.0 * Math.cos(5.0 * Math.PI / 16.0) * 0.125),
54              (float) (4.0 * Math.cos(6.0 * Math.PI / 16.0) * 0.125), (float) (4.0 * Math.cos(7.0 * Math.PI / 16.0) * 0.125), };
55  
56      private static final float A1 = (float) Math.cos(2.0 * Math.PI / 8.0);
57      private static final float A2 = (float) (Math.cos(Math.PI / 8.0) - Math.cos(3.0 * Math.PI / 8.0));
58      private static final float A3 = A1;
59      private static final float A4 = (float) (Math.cos(Math.PI / 8.0) + Math.cos(3.0 * Math.PI / 8.0));
60      private static final float A5 = (float) Math.cos(3.0 * Math.PI / 8.0);
61  
62      private static final float C2 = (float) (2.0 * Math.cos(Math.PI / 8));
63      private static final float C4 = (float) (2.0 * Math.cos(2 * Math.PI / 8));
64      private static final float C6 = (float) (2.0 * Math.cos(3 * Math.PI / 8));
65      private static final float Q = C2 - C6;
66      private static final float R = C2 + C6;
67  
68      /**
69       * Fast forward Dct using AA&N. Taken from the book "JPEG still image data compression standard", by Pennebaker and Mitchell, chapter 4, figure "4-8".
70       *
71       * @param vector vector.
72       */
73      public static void forwardDct8(final float[] vector) {
74          final float a00 = vector[0] + vector[7];
75          final float a10 = vector[1] + vector[6];
76          final float a20 = vector[2] + vector[5];
77          final float a30 = vector[3] + vector[4];
78          final float a40 = vector[3] - vector[4];
79          final float a50 = vector[2] - vector[5];
80          final float a60 = vector[1] - vector[6];
81          final float a70 = vector[0] - vector[7];
82  
83          final float a01 = a00 + a30;
84          final float a11 = a10 + a20;
85          final float a21 = a10 - a20;
86          final float a31 = a00 - a30;
87          // Avoid some negations:
88          // float a41 = -a40 - a50;
89          final float neg_a41 = a40 + a50;
90          final float a51 = a50 + a60;
91          final float a61 = a60 + a70;
92  
93          final float a22 = a21 + a31;
94  
95          final float a23 = a22 * A1;
96          final float mul5 = (a61 - neg_a41) * A5;
97          final float a43 = neg_a41 * A2 - mul5;
98          final float a53 = a51 * A3;
99          final float a63 = a61 * A4 - mul5;
100 
101         final float a54 = a70 + a53;
102         final float a74 = a70 - a53;
103 
104         vector[0] = a01 + a11;
105         vector[4] = a01 - a11;
106         vector[2] = a31 + a23;
107         vector[6] = a31 - a23;
108         vector[5] = a74 + a43;
109         vector[1] = a54 + a63;
110         vector[7] = a54 - a63;
111         vector[3] = a74 - a43;
112     }
113 
114     public static void forwardDct8x8(final float[] matrix) {
115         float a00, a10, a20, a30, a40, a50, a60, a70;
116         float a01, a11, a21, a31, neg_a41, a51, a61;
117         float a22, a23, mul5, a43, a53, a63;
118         float a54, a74;
119 
120         for (int i = 0; i < 8; i++) {
121             a00 = matrix[8 * i] + matrix[8 * i + 7];
122             a10 = matrix[8 * i + 1] + matrix[8 * i + 6];
123             a20 = matrix[8 * i + 2] + matrix[8 * i + 5];
124             a30 = matrix[8 * i + 3] + matrix[8 * i + 4];
125             a40 = matrix[8 * i + 3] - matrix[8 * i + 4];
126             a50 = matrix[8 * i + 2] - matrix[8 * i + 5];
127             a60 = matrix[8 * i + 1] - matrix[8 * i + 6];
128             a70 = matrix[8 * i] - matrix[8 * i + 7];
129             a01 = a00 + a30;
130             a11 = a10 + a20;
131             a21 = a10 - a20;
132             a31 = a00 - a30;
133             neg_a41 = a40 + a50;
134             a51 = a50 + a60;
135             a61 = a60 + a70;
136             a22 = a21 + a31;
137             a23 = a22 * A1;
138             mul5 = (a61 - neg_a41) * A5;
139             a43 = neg_a41 * A2 - mul5;
140             a53 = a51 * A3;
141             a63 = a61 * A4 - mul5;
142             a54 = a70 + a53;
143             a74 = a70 - a53;
144             matrix[8 * i] = a01 + a11;
145             matrix[8 * i + 4] = a01 - a11;
146             matrix[8 * i + 2] = a31 + a23;
147             matrix[8 * i + 6] = a31 - a23;
148             matrix[8 * i + 5] = a74 + a43;
149             matrix[8 * i + 1] = a54 + a63;
150             matrix[8 * i + 7] = a54 - a63;
151             matrix[8 * i + 3] = a74 - a43;
152         }
153 
154         for (int i = 0; i < 8; i++) {
155             a00 = matrix[i] + matrix[56 + i];
156             a10 = matrix[8 + i] + matrix[48 + i];
157             a20 = matrix[16 + i] + matrix[40 + i];
158             a30 = matrix[24 + i] + matrix[32 + i];
159             a40 = matrix[24 + i] - matrix[32 + i];
160             a50 = matrix[16 + i] - matrix[40 + i];
161             a60 = matrix[8 + i] - matrix[48 + i];
162             a70 = matrix[i] - matrix[56 + i];
163             a01 = a00 + a30;
164             a11 = a10 + a20;
165             a21 = a10 - a20;
166             a31 = a00 - a30;
167             neg_a41 = a40 + a50;
168             a51 = a50 + a60;
169             a61 = a60 + a70;
170             a22 = a21 + a31;
171             a23 = a22 * A1;
172             mul5 = (a61 - neg_a41) * A5;
173             a43 = neg_a41 * A2 - mul5;
174             a53 = a51 * A3;
175             a63 = a61 * A4 - mul5;
176             a54 = a70 + a53;
177             a74 = a70 - a53;
178             matrix[i] = a01 + a11;
179             matrix[32 + i] = a01 - a11;
180             matrix[16 + i] = a31 + a23;
181             matrix[48 + i] = a31 - a23;
182             matrix[40 + i] = a74 + a43;
183             matrix[8 + i] = a54 + a63;
184             matrix[56 + i] = a54 - a63;
185             matrix[24 + i] = a74 - a43;
186         }
187     }
188 
189     /**
190      * Fast inverse Dct using AA&N. This is taken from the beautiful [BROEKN URL] http://vsr.finermatik.tu-chemnitz.de/~jan/MPEG/HTML/IDCT.html which gives easy
191      * equations and properly explains constants and scaling factors. Terms have been inlined and the negation optimized out of existence.
192      *
193      * @param vector vector.
194      */
195     public static void inverseDct8(final float[] vector) {
196         // B1
197         final float a2 = vector[2] - vector[6];
198         final float a3 = vector[2] + vector[6];
199         final float a4 = vector[5] - vector[3];
200         final float tmp1 = vector[1] + vector[7];
201         final float tmp2 = vector[3] + vector[5];
202         final float a5 = tmp1 - tmp2;
203         final float a6 = vector[1] - vector[7];
204         final float a7 = tmp1 + tmp2;
205 
206         // M
207         final float tmp4 = C6 * (a4 + a6);
208         // Eliminate the negative:
209         // float b4 = -Q*a4 - tmp4;
210         final float neg_b4 = Q * a4 + tmp4;
211         final float b6 = R * a6 - tmp4;
212         final float b2 = a2 * C4;
213         final float b5 = a5 * C4;
214 
215         // A1
216         final float tmp3 = b6 - a7;
217         final float n0 = tmp3 - b5;
218         final float n1 = vector[0] - vector[4];
219         final float n2 = b2 - a3;
220         final float n3 = vector[0] + vector[4];
221         final float neg_n5 = neg_b4;
222 
223         // A2
224         final float m3 = n1 + n2;
225         final float m4 = n3 + a3;
226         final float m5 = n1 - n2;
227         final float m6 = n3 - a3;
228         // float m7 = n5 - n0;
229         final float neg_m7 = neg_n5 + n0;
230 
231         // A3
232         vector[0] = m4 + a7;
233         vector[1] = m3 + tmp3;
234         vector[2] = m5 - n0;
235         vector[3] = m6 + neg_m7;
236         vector[4] = m6 - neg_m7;
237         vector[5] = m5 + n0;
238         vector[6] = m3 - tmp3;
239         vector[7] = m4 - a7;
240     }
241 
242     public static void inverseDct8x8(final float[] matrix) {
243         float a2, a3, a4, tmp1, tmp2, a5, a6, a7;
244         float tmp4, neg_b4, b6, b2, b5;
245         float tmp3, n0, n1, n2, n3, neg_n5;
246         float m3, m4, m5, m6, neg_m7;
247 
248         for (int i = 0; i < 8; i++) {
249             a2 = matrix[8 * i + 2] - matrix[8 * i + 6];
250             a3 = matrix[8 * i + 2] + matrix[8 * i + 6];
251             a4 = matrix[8 * i + 5] - matrix[8 * i + 3];
252             tmp1 = matrix[8 * i + 1] + matrix[8 * i + 7];
253             tmp2 = matrix[8 * i + 3] + matrix[8 * i + 5];
254             a5 = tmp1 - tmp2;
255             a6 = matrix[8 * i + 1] - matrix[8 * i + 7];
256             a7 = tmp1 + tmp2;
257             tmp4 = C6 * (a4 + a6);
258             neg_b4 = Q * a4 + tmp4;
259             b6 = R * a6 - tmp4;
260             b2 = a2 * C4;
261             b5 = a5 * C4;
262             tmp3 = b6 - a7;
263             n0 = tmp3 - b5;
264             n1 = matrix[8 * i] - matrix[8 * i + 4];
265             n2 = b2 - a3;
266             n3 = matrix[8 * i] + matrix[8 * i + 4];
267             neg_n5 = neg_b4;
268             m3 = n1 + n2;
269             m4 = n3 + a3;
270             m5 = n1 - n2;
271             m6 = n3 - a3;
272             neg_m7 = neg_n5 + n0;
273             matrix[8 * i] = m4 + a7;
274             matrix[8 * i + 1] = m3 + tmp3;
275             matrix[8 * i + 2] = m5 - n0;
276             matrix[8 * i + 3] = m6 + neg_m7;
277             matrix[8 * i + 4] = m6 - neg_m7;
278             matrix[8 * i + 5] = m5 + n0;
279             matrix[8 * i + 6] = m3 - tmp3;
280             matrix[8 * i + 7] = m4 - a7;
281         }
282 
283         for (int i = 0; i < 8; i++) {
284             a2 = matrix[16 + i] - matrix[48 + i];
285             a3 = matrix[16 + i] + matrix[48 + i];
286             a4 = matrix[40 + i] - matrix[24 + i];
287             tmp1 = matrix[8 + i] + matrix[56 + i];
288             tmp2 = matrix[24 + i] + matrix[40 + i];
289             a5 = tmp1 - tmp2;
290             a6 = matrix[8 + i] - matrix[56 + i];
291             a7 = tmp1 + tmp2;
292             tmp4 = C6 * (a4 + a6);
293             neg_b4 = Q * a4 + tmp4;
294             b6 = R * a6 - tmp4;
295             b2 = a2 * C4;
296             b5 = a5 * C4;
297             tmp3 = b6 - a7;
298             n0 = tmp3 - b5;
299             n1 = matrix[i] - matrix[32 + i];
300             n2 = b2 - a3;
301             n3 = matrix[i] + matrix[32 + i];
302             neg_n5 = neg_b4;
303             m3 = n1 + n2;
304             m4 = n3 + a3;
305             m5 = n1 - n2;
306             m6 = n3 - a3;
307             neg_m7 = neg_n5 + n0;
308             matrix[i] = m4 + a7;
309             matrix[8 + i] = m3 + tmp3;
310             matrix[16 + i] = m5 - n0;
311             matrix[24 + i] = m6 + neg_m7;
312             matrix[32 + i] = m6 - neg_m7;
313             matrix[40 + i] = m5 + n0;
314             matrix[48 + i] = m3 - tmp3;
315             matrix[56 + i] = m4 - a7;
316         }
317     }
318 
319     public static void scaleDequantizationMatrix(final float[] matrix) {
320         for (int y = 0; y < 8; y++) {
321             for (int x = 0; x < 8; x++) {
322                 matrix[8 * y + x] *= IDCT_SCALING_FACTORS[y] * IDCT_SCALING_FACTORS[x];
323             }
324         }
325     }
326 
327     public static void scaleDequantizationVector(final float[] vector) {
328         for (int x = 0; x < 8; x++) {
329             vector[x] *= IDCT_SCALING_FACTORS[x];
330         }
331     }
332 
333     public static void scaleQuantizationMatrix(final float[] matrix) {
334         for (int y = 0; y < 8; y++) {
335             for (int x = 0; x < 8; x++) {
336                 matrix[8 * y + x] *= DCT_SCALING_FACTORS[y] * DCT_SCALING_FACTORS[x];
337             }
338         }
339     }
340 
341     public static void scaleQuantizationVector(final float[] vector) {
342         for (int x = 0; x < 8; x++) {
343             vector[x] *= DCT_SCALING_FACTORS[x];
344         }
345     }
346 
347     private Dct() {
348     }
349 }