MythTV  master
yuv2rgb.cpp
Go to the documentation of this file.
1 /*
2  * yuv2rgb_mmx.c
3  * Copyright (C) 2000-2001 Silicon Integrated System Corp.
4  * All Rights Reserved.
5  *
6  * Author: Olie Lho <ollie@sis.com.tw>
7  *
8  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
9  * See http://libmpeg2.sourceforge.net/ for updates.
10  *
11  * mpeg2dec is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * mpeg2dec is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24  */
25 
26 #include <algorithm>
27 #include <cinttypes>
28 #include <climits>
29 #include <cmath>
30 #include <cstdio>
31 #include <cstdlib>
32 #include "mythconfig.h"
33 
34 #if HAVE_MMX
35 extern "C" {
36 #include "ffmpeg-mmx.h"
37 }
38 #define CPU_MMXEXT 0
39 #define CPU_MMX 1
40 #endif
41 
42 #if HAVE_ALTIVEC
43 extern "C" {
44 #include "libavutil/cpu.h"
45 }
46 int has_altivec(void);
47 #if HAVE_ALTIVEC_H
48 #include <altivec.h>
49 #else
50 #include <Accelerate/Accelerate.h>
51 #endif
52 #endif
53 #include "yuv2rgb.h"
54 
55 #if HAVE_ALTIVEC
56 int has_altivec(void)
57 {
58  int cpu_flags = av_get_cpu_flags();
59  if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
60  return(1);
61 
62  return(0);
63 }
64 #endif
65 
71 static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
72  unsigned char *pu, unsigned char *pv,
73  int h_size, int v_size, int rgb_stride,
74  int y_stride, int uv_stride, int alphaones);
75 
76 /* CPU_MMXEXT/CPU_MMX adaptation layer */
77 
// movntq: emit a non-temporal (cache-bypassing) 8-byte store when the
// selected code path is MMXEXT, and a plain MMX movq store otherwise.
// A variable named 'cpu' must be in scope at every expansion site.
78 #define movntq(src,dest) \
79 do { \
80  if (cpu == CPU_MMXEXT) \
81  movntq_r2m (src, dest); \
82  else \
83  movq_r2m (src, dest); \
84 } while (0)
85 
86 #if HAVE_MMX
// Convert 8 luma samples (*py) plus 4 shared chroma samples (*pu, *pv)
// into clamped R/G/B bytes, leaving B in mm0, R in mm1 and G in mm2.
// The constants below are fixed-point BT.601-style coefficients packed
// four-per-qword for pmulhw; precision is promoted by the psllw shifts
// so the pmulhw high-half multiply keeps the useful bits.
87 static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
88 {
89  static mmx_t mmx_80w = {0x0080008000800080ULL};
90  static mmx_t mmx_U_green = {0xf37df37df37df37dULL};
91  static mmx_t mmx_U_blue = {0x4093409340934093ULL};
92  static mmx_t mmx_V_red = {0x3312331233123312ULL};
93  static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcULL};
94  static mmx_t mmx_10w = {0x1010101010101010ULL};
95  static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffULL};
96  static mmx_t mmx_Y_coeff = {0x253f253f253f253fULL};
97 
98  movd_m2r (*pu, mm0); // mm0 = 00 00 00 00 u3 u2 u1 u0
99  movd_m2r (*pv, mm1); // mm1 = 00 00 00 00 v3 v2 v1 v0
100  movq_m2r (*py, mm6); // mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
101  pxor_r2r (mm4, mm4); // mm4 = 0
102  /* XXX might do cache preload for image here */
103 
104  /*
105  * Do the multiply part of the conversion for even and odd pixels
106  * register usage:
107  * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
108  * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
109  * mm6 -> Y even, mm7 -> Y odd
110  */
111 
112  punpcklbw_r2r (mm4, mm0); // mm0 = u3 u2 u1 u0
113  punpcklbw_r2r (mm4, mm1); // mm1 = v3 v2 v1 v0
114  psubsw_m2r (mmx_80w, mm0); // u -= 128
115  psubsw_m2r (mmx_80w, mm1); // v -= 128
116  psllw_i2r (3, mm0); // promote precision
117  psllw_i2r (3, mm1); // promote precision
118  movq_r2r (mm0, mm2); // mm2 = u3 u2 u1 u0
119  movq_r2r (mm1, mm3); // mm3 = v3 v2 v1 v0
120  pmulhw_m2r (mmx_U_green, mm2); // mm2 = u * u_green
121  pmulhw_m2r (mmx_V_green, mm3); // mm3 = v * v_green
122  pmulhw_m2r (mmx_U_blue, mm0); // mm0 = chroma_b
123  pmulhw_m2r (mmx_V_red, mm1); // mm1 = chroma_r
124  paddsw_r2r (mm3, mm2); // mm2 = chroma_g
125 
126  psubusb_m2r (mmx_10w, mm6); // Y -= 16
127  movq_r2r (mm6, mm7); // mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
128  pand_m2r (mmx_00ffw, mm6); // mm6 = Y6 Y4 Y2 Y0
129  psrlw_i2r (8, mm7); // mm7 = Y7 Y5 Y3 Y1
130  psllw_i2r (3, mm6); // promote precision
131  psllw_i2r (3, mm7); // promote precision
132  pmulhw_m2r (mmx_Y_coeff, mm6); // mm6 = luma_rgb even
133  pmulhw_m2r (mmx_Y_coeff, mm7); // mm7 = luma_rgb odd
134 
135  /*
136  * Do the addition part of the conversion for even and odd pixels
137  * register usage:
138  * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
139  * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
140  * mm6 -> Y even, mm7 -> Y odd
141  */
142 
143  movq_r2r (mm0, mm3); // mm3 = chroma_b
144  movq_r2r (mm1, mm4); // mm4 = chroma_r
145  movq_r2r (mm2, mm5); // mm5 = chroma_g
146  paddsw_r2r (mm6, mm0); // mm0 = B6 B4 B2 B0
147  paddsw_r2r (mm7, mm3); // mm3 = B7 B5 B3 B1
148  paddsw_r2r (mm6, mm1); // mm1 = R6 R4 R2 R0
149  paddsw_r2r (mm7, mm4); // mm4 = R7 R5 R3 R1
150  paddsw_r2r (mm6, mm2); // mm2 = G6 G4 G2 G0
151  paddsw_r2r (mm7, mm5); // mm5 = G7 G5 G3 G1
152  packuswb_r2r (mm0, mm0); // saturate to 0-255
153  packuswb_r2r (mm1, mm1); // saturate to 0-255
154  packuswb_r2r (mm2, mm2); // saturate to 0-255
155  packuswb_r2r (mm3, mm3); // saturate to 0-255
156  packuswb_r2r (mm4, mm4); // saturate to 0-255
157  packuswb_r2r (mm5, mm5); // saturate to 0-255
158  punpcklbw_r2r (mm3, mm0); // mm0 = B7 B6 B5 B4 B3 B2 B1 B0
159  punpcklbw_r2r (mm4, mm1); // mm1 = R7 R6 R5 R4 R3 R2 R1 R0
160  punpcklbw_r2r (mm5, mm2); // mm2 = G7 G6 G5 G4 G3 G2 G1 G0
161 }
162 
// Pack the B/R/G planes left in mm0/mm1/mm2 by mmx_yuv2rgb into 8 pixels
// of RGB565 and store them at 'image' (16 bytes).  'cpu' selects between
// movntq (MMXEXT, non-temporal) and movq stores via the movntq() macro.
163 static inline void mmx_unpack_16rgb (uint8_t * image, int cpu)
164 {
165  static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};
166  static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
167  static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};
168 
169  /*
170  * convert RGB plane to RGB 16 bits
171  * mm0 -> B, mm1 -> R, mm2 -> G
172  * mm4 -> GB, mm5 -> AR pixel 4-7
173  * mm6 -> GB, mm7 -> AR pixel 0-3
174  */
175 
176  pand_m2r (mmx_bluemask, mm0); // mm0 = b7b6b5b4b3______
177  pand_m2r (mmx_greenmask, mm2); // mm2 = g7g6g5g4g3g2____
178  pand_m2r (mmx_redmask, mm1); // mm1 = r7r6r5r4r3______
179  psrlq_i2r (3, mm0); // mm0 = ______b7b6b5b4b3
180  pxor_r2r (mm4, mm4); // mm4 = 0
181  movq_r2r (mm0, mm5); // mm5 = ______b7b6b5b4b3
182  movq_r2r (mm2, mm7); // mm7 = g7g6g5g4g3g2____
183 
184  punpcklbw_r2r (mm4, mm2);
185  punpcklbw_r2r (mm1, mm0);
186  psllq_i2r (3, mm2);
187  por_r2r (mm2, mm0);
188  movntq (mm0, *image);
189 
190  punpckhbw_r2r (mm4, mm7);
191  punpckhbw_r2r (mm1, mm5);
192  psllq_i2r (3, mm7);
193  por_r2r (mm7, mm5);
194  movntq (mm5, *(image+8));
195 }
196 
// Pack the B/R/G planes left in mm0/mm1/mm2 by mmx_yuv2rgb into 8 pixels
// of 32-bit BGRA and store them at 'image' (32 bytes).  The alpha byte of
// every pixel is 0xff when alphaones is non-zero, else 0x00 (mm3 holds
// the alpha plane).
197 static inline void mmx_unpack_32rgb (uint8_t * image, int cpu, int alphaones)
198 {
199  /*
200  * convert RGB plane to RGB packed format,
201  * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
202  * mm4 -> GB, mm5 -> AR pixel 4-7,
203  * mm6 -> GB, mm7 -> AR pixel 0-3
204  */
205 
206  if (alphaones)
207  {
208  static mmx_t mmx_1s = {0xffffffffffffffffLL};
209  movq_m2r (mmx_1s, mm3);
210  }
211  else
212  pxor_r2r (mm3, mm3);
213 
214  movq_r2r (mm0, mm6);
215  movq_r2r (mm1, mm7);
216  movq_r2r (mm0, mm4);
217  movq_r2r (mm1, mm5);
218  punpcklbw_r2r (mm2, mm6);
219  punpcklbw_r2r (mm3, mm7);
220  punpcklwd_r2r (mm7, mm6);
221  movntq (mm6, *image);
222  movq_r2r (mm0, mm6);
223  punpcklbw_r2r (mm2, mm6);
224  punpckhwd_r2r (mm7, mm6);
225  movntq (mm6, *(image+8));
226  punpckhbw_r2r (mm2, mm4);
227  punpckhbw_r2r (mm3, mm5);
228  punpcklwd_r2r (mm5, mm4);
229  movntq (mm4, *(image+16));
230  movq_r2r (mm0, mm4);
231  punpckhbw_r2r (mm2, mm4);
232  punpckhwd_r2r (mm5, mm4);
233  movntq (mm4, *(image+24));
234 }
235 
// Full-frame I420 -> RGB565 conversion, 8 pixels per inner iteration.
// Width is assumed to be a multiple of 8 (width >>= 3 drops any remainder).
// The chroma pointers advance only after odd rows; on even rows they are
// rewound (pu/pv -= 4*width, i.e. one chroma row of width/2 bytes after
// the earlier width >>= 3) so each chroma row is used for two luma rows.
236 static inline void yuv420_rgb16 (uint8_t * image,
237  uint8_t * py, uint8_t * pu, uint8_t * pv,
238  int width, int height,
239  int rgb_stride, int y_stride, int uv_stride,
240  int cpu, int alphaones)
241 {
242  (void)alphaones;
243  int i;
244 
245  rgb_stride -= 2 * width;
246  y_stride -= width;
247  uv_stride -= width >> 1;
248  width >>= 3;
249 
250  do {
251  i = width;
252  do {
253  mmx_yuv2rgb (py, pu, pv);
254  mmx_unpack_16rgb (image, cpu);
255  py += 8;
256  pu += 4;
257  pv += 4;
258  image += 16;
259  } while (--i);
260 
261  py += y_stride;
262  image += rgb_stride;
263  if (height & 1) {
264  pu += uv_stride;
265  pv += uv_stride;
266  } else {
267  pu -= 4 * width;
268  pv -= 4 * width;
269  }
270  } while (--height);
271 
272  emms();
273 }
274 
// Full-frame I420 -> 32-bit BGRA conversion, 8 pixels per inner iteration.
// Same row/chroma stepping scheme as yuv420_rgb16 above; alphaones selects
// an opaque (0xff) or zero alpha channel in mmx_unpack_32rgb.
275 static inline void yuv420_argb32 (uint8_t * image, uint8_t * py,
276  uint8_t * pu, uint8_t * pv,
277  int width, int height,
278  int rgb_stride, int y_stride, int uv_stride,
279  int cpu, int alphaones)
280 {
281  int i;
282 
283  rgb_stride -= 4 * width;
284  y_stride -= width;
285  uv_stride -= width >> 1;
286  width >>= 3;
287 
288  do {
289  i = width;
290  do {
291  mmx_yuv2rgb (py, pu, pv);
292  mmx_unpack_32rgb (image, cpu, alphaones);
293  py += 8;
294  pu += 4;
295  pv += 4;
296  image += 32;
297  } while (--i);
298 
299  py += y_stride;
300  image += rgb_stride;
301  if (height & 1) {
302  pu += uv_stride;
303  pv += uv_stride;
304  } else {
305  pu -= 4 * width;
306  pv -= 4 * width;
307  }
308  } while (--height);
309 
310  emms();
311 }
312 
313 static void mmxext_rgb16 (uint8_t * image,
314  uint8_t * py, uint8_t * pu, uint8_t * pv,
315  int width, int height,
316  int rgb_stride, int y_stride, int uv_stride,
317  int alphaones)
318 {
319  yuv420_rgb16 (image, py, pu, pv, width, height,
320  rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
321 }
322 
323 static void mmxext_argb32 (uint8_t * image,
324  uint8_t * py, uint8_t * pu, uint8_t * pv,
325  int width, int height,
326  int rgb_stride, int y_stride, int uv_stride,
327  int alphaones)
328 {
329  yuv420_argb32 (image, py, pu, pv, width, height,
330  rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
331 }
332 
333 static void mmx_rgb16 (uint8_t * image,
334  uint8_t * py, uint8_t * pu, uint8_t * pv,
335  int width, int height,
336  int rgb_stride, int y_stride, int uv_stride,
337  int alphaones)
338 {
339  yuv420_rgb16 (image, py, pu, pv, width, height,
340  rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
341 }
342 
343 static void mmx_argb32 (uint8_t * image,
344  uint8_t * py, uint8_t * pu, uint8_t * pv,
345  int width, int height,
346  int rgb_stride, int y_stride, int uv_stride,
347  int alphaones)
348 {
349  yuv420_argb32 (image, py, pu, pv, width, height,
350  rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
351 }
352 #endif
353 
363 yuv2rgb_fun yuv2rgb_init_mmxext (int bpp, int mode)
364 {
365 #if HAVE_MMX
366  if ((bpp == 16) && (mode == MODE_RGB))
367  return mmxext_rgb16;
368  else if ((bpp == 32) && (mode == MODE_RGB))
369  return mmxext_argb32;
370 #endif
371 
372  (void)bpp;
373  (void)mode;
374 
375  return nullptr; /* Fallback to C */
376 }
377 
387 yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode)
388 {
389 #if HAVE_MMX
390  if ((bpp == 16) && (mode == MODE_RGB))
391  return mmx_rgb16;
392  else if ((bpp == 32) && (mode == MODE_RGB))
393  return mmx_argb32;
394 #endif
395  if ((bpp == 32) && (mode == MODE_RGB))
396  return yuv420_argb32_non_mmx;
397 
398  return nullptr;
399 }
400 
// Fixed-point scale for the plain-C converter below (values are kept
// with SCALE_BITS fractional bits).
401 #define SCALE_BITS 10
402 
// Limited-range BT.601-style coefficients, pre-scaled from 16-bit
// fixed point down to SCALE_BITS (e.g. 76309 = round(255/219 * 65536)).
403 #define C_Y (76309 >> (16 - SCALE_BITS))
404 #define C_RV (117504 >> (16 - SCALE_BITS))
405 #define C_BU (138453 >> (16 - SCALE_BITS))
406 #define C_GU (13954 >> (16 - SCALE_BITS))
407 #define C_GV (34903 >> (16 - SCALE_BITS))
408 
// Platform quirks: make sure UCHAR_MAX is usable as a plain int in the
// std::min/std::max clamps of RGBOUT below.
409 #if defined(ANDROID)
410 #undef UCHAR_MAX
411 #define UCHAR_MAX 0xff
412 #endif
413 #if defined(__FreeBSD__)
414 // HACK: this is actually only needed on AMD64 at the moment,
415 // but it doesn't hurt the other architectures.
416 #undef UCHAR_MAX
417 #define UCHAR_MAX (int)__UCHAR_MAX
418 #endif
419 
// RGBOUT: convert one luma sample y1 (plus the caller-computed chroma
// contributions r_add/g_add/b_add) into clamped 0..UCHAR_MAX r/g/b bytes.
// Relies on local variables y, r_add, g_add and b_add at the expansion site.
420 #define RGBOUT(r, g, b, y1)\
421 {\
422  y = ((y1) - 16) * C_Y;\
423  (r) = std::min(UCHAR_MAX, std::max(0, (y + r_add) >> SCALE_BITS));\
424  (g) = std::min(UCHAR_MAX, std::max(0, (y + g_add) >> SCALE_BITS));\
425  (b) = std::min(UCHAR_MAX, std::max(0, (y + b_add) >> SCALE_BITS));\
426 }
427 
// Plain C I420 -> 32-bit RGBA conversion, processing a 2x2 pixel quad per
// inner iteration so each chroma sample is applied to four luma samples.
// The rgb_stride/y_stride/uv_stride parameters are ignored: the planes are
// treated as tightly packed (strides derived from h_size).
// NOTE(review): the outer loop steps v_size by 2 with no odd-row handling —
// assumes v_size is even; confirm callers guarantee this.
428 static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
429  unsigned char *pu, unsigned char *pv,
430  int h_size, int v_size, int rgb_stride,
431  int y_stride, int uv_stride, int alphaones)
432 {
433  unsigned char *y1_ptr, *y2_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2;
434  int w, y, cb, cr, r_add, g_add, b_add, width2;
435  int dstwidth;
436 
437 // byte indices
438 #if HAVE_BIGENDIAN
439 #define R_OI 1
440 #define G_OI 2
441 #define B_OI 3
442 #define A_OI 0
443 #else
444 #define R_OI 2
445 #define G_OI 1
446 #define B_OI 0
447 #define A_OI 3
448 #endif
449 
450  // squelch a warning
451  (void) rgb_stride; (void) y_stride; (void) uv_stride;
452 
453  d = image;
454  y1_ptr = py;
455  cb_ptr = pu;
456  cr_ptr = pv;
457  dstwidth = h_size * 4;
458  width2 = h_size / 2;
459 
460  for(;v_size > 0; v_size -= 2) {
461  d1 = d;
462  d2 = d + h_size * 4;
463  y2_ptr = y1_ptr + h_size;
464  for(w = width2; w > 0; w--) {
465  cb = cb_ptr[0] - 128;
466  cr = cr_ptr[0] - 128;
467  r_add = C_RV * cr + (1 << (SCALE_BITS - 1));
468  g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1));
469  b_add = C_BU * cb + (1 << (SCALE_BITS - 1));
470 
471  /* output 4 pixels */
472  RGBOUT(d1[R_OI], d1[G_OI], d1[B_OI], y1_ptr[0]);
473  RGBOUT(d1[R_OI+4], d1[G_OI+4], d1[B_OI+4], y1_ptr[1]);
474  RGBOUT(d2[R_OI], d2[G_OI], d2[B_OI], y2_ptr[0]);
475  RGBOUT(d2[R_OI+4], d2[G_OI+4], d2[B_OI+4], y2_ptr[1]);
476 
477  if (alphaones)
478  d1[A_OI] = d1[A_OI+4] = d2[A_OI] = d2[A_OI+4] = 0xff;
479  else
480  d1[A_OI] = d1[A_OI+4] = d2[A_OI] = d2[A_OI+4] = 0;
481 
482  d1 += 8;
483  d2 += 8;
484  y1_ptr += 2;
485  y2_ptr += 2;
486  cb_ptr++;
487  cr_ptr++;
488  }
489  d += 2 * dstwidth;
490  y1_ptr += h_size;
491  }
492 }
493 
// Fixed-point scale for the RGB -> YUV direction; FIX() converts a float
// coefficient to SCALEBITS fixed point with rounding.
494 #define SCALEBITS 8
495 #define ONE_HALF (1 << (SCALEBITS - 1))
496 #define FIX(x) (lroundf((x) * (1L<<SCALEBITS)))
497 
// Convert packed 32-bit RGBA (plus alpha) to planar YUV420 + a separate
// alpha plane.  Processes 2x2 quads; the chroma outputs average the quad's
// four RGB samples.  Odd widths/heights are padded with luma 16 / alpha 0.
// 'srcwidth' is the source row length in pixels (allows a wider source
// than 'width'); the luma/alpha planes use a row length of width rounded
// up to even ('wrap').
// NOTE(review): the expression stored into cr[] is the textbook Cb
// (-0.169R -0.331G +0.5B) formula and the one stored into cb[] is the
// textbook Cr formula — confirm callers expect this cb/cr ordering before
// "fixing" it.
502 void rgb32_to_yuv420p(unsigned char *lum, unsigned char *cb, unsigned char *cr,
503  unsigned char *alpha, unsigned char *src,
504  int width, int height, int srcwidth)
505 {
506  int wrap, wrap4, x, y;
507  int r, g, b, r1, g1, b1;
508  unsigned char *p;
509 
510 // byte indices
511 #if HAVE_BIGENDIAN
512 #define R_II 3
513 #define G_II 2
514 #define B_II 1
515 #define A_II 0
516 #else
517 #define R_II 0
518 #define G_II 1
519 #define B_II 2
520 #define A_II 3
521 #endif
522 
523  wrap = (width + 1) & ~1;
524  wrap4 = srcwidth * 4;
525  p = src;
526  for(y=0;y+1<height;y+=2) {
527  for(x=0;x+1<width;x+=2) {
528  r = p[R_II];
529  g = p[G_II];
530  b = p[B_II];
531  r1 = r;
532  g1 = g;
533  b1 = b;
534  lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
535  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
536  alpha[0] = p[A_II];
537 
538  r = p[R_II+4];
539  g = p[G_II+4];
540  b = p[B_II+4];
541  r1 += r;
542  g1 += g;
543  b1 += b;
544  lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g +
545  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
546  alpha[1] = p[A_II+4];
547 
548  p += wrap4;
549  lum += wrap;
550  alpha += wrap;
551 
552  r = p[R_II];
553  g = p[G_II];
554  b = p[B_II];
555  r1 += r;
556  g1 += g;
557  b1 += b;
558  lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
559  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
560  alpha[0] = p[A_II];
561 
562  r = p[R_II+4];
563  g = p[G_II+4];
564  b = p[B_II+4];
565  r1 += r;
566  g1 += g;
567  b1 += b;
568  lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g +
569  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
570  alpha[1] = p[A_II+4];
571 
572  cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
573  FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) +
574  128;
575  cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
576  FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) +
577  128;
578 
579  cb++;
580  cr++;
581  p += -wrap4 + 2 * 4;
582  lum += -wrap + 2;
583  alpha += -wrap + 2;
584  }
585  if (width & 1) {
586  r = p[R_II];
587  g = p[G_II];
588  b = p[B_II];
589  r1 = r;
590  g1 = g;
591  b1 = b;
592  lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
593  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
594  alpha[0] = p[A_II];
595 
596  lum[1] = 16;
597  alpha[1] = 0;
598 
599  p += wrap4;
600  lum += wrap;
601  alpha += wrap;
602 
603  r = p[R_II];
604  g = p[G_II];
605  b = p[B_II];
606  r1 += r;
607  g1 += g;
608  b1 += b;
609  lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
610  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
611  alpha[0] = p[A_II];
612 
613  lum[1] = 16;
614  alpha[1] = 0;
615 
616  cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
617  FIX(0.50000) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) +
618  128;
619  cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
620  FIX(0.08131) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) +
621  128;
622 
623  cb++;
624  cr++;
625  p += -wrap4 + 4;
626  lum += -wrap + 2;
627  alpha += -wrap + 2;
628  }
629  p += wrap4 * 2 - width * 4;
630  lum += wrap;
631  alpha += wrap;
632  }
633  if (height & 1) {
634  for(x=0;x+1<width;x+=2) {
635  r = p[R_II];
636  g = p[G_II];
637  b = p[B_II];
638  r1 = r;
639  g1 = g;
640  b1 = b;
641  lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
642  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
643  alpha[0] = p[A_II];
644 
645  r = p[R_II+4];
646  g = p[G_II+4];
647  b = p[B_II+4];
648  r1 += r;
649  g1 += g;
650  b1 += b;
651  lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g +
652  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
653  alpha[1] = p[A_II+4];
654 
655  lum += wrap;
656  alpha += wrap;
657 
658  lum[0] = 16;
659  alpha[0] = 0;
660 
661  lum[1] = 16;
662  alpha[1] = 0;
663 
664  cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
665  FIX(0.50000) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) +
666  128;
667  cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
668  FIX(0.08131) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) +
669  128;
670 
671  cb++;
672  cr++;
673  p += 2 * 4;
674  lum += -wrap + 2;
675  alpha += -wrap + 2;
676  }
677  if (width & 1) {
678  r = p[R_II];
679  g = p[G_II];
680  b = p[B_II];
681  r1 = r;
682  g1 = g;
683  b1 = b;
684  lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
685  FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
686  alpha[0] = p[A_II];
687 
688  lum[1] = 16;
689  alpha[1] = 0;
690 
691  lum += wrap;
692  alpha += wrap;
693 
694  lum[0] = 16;
695  alpha[0] = 0;
696 
697  lum[1] = 16;
698  alpha[1] = 0;
699 
700  cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
701  FIX(0.50000) * b1 + ONE_HALF - 1) >> SCALEBITS) +
702  128;
703  cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
704  FIX(0.08131) * b1 + ONE_HALF - 1) >> SCALEBITS) +
705  128;
706 
707 #if 0 // no point in updating after the last pixel
708  cb++;
709  cr++;
710  p += 4;
711  lum += -wrap + 2;
712  alpha += -wrap + 2;
713 #endif
714  }
715  }
716 }
717 
718 /* I420 to 2VUY colorspace conversion routines.
719  *
720  * In the early days of the OS X port of MythTV, Paul Jara noticed that
721  * QuickTime spent a lot of time converting from YUV420 to YUV422.
722  * He found some sample code on the Ars Technica forum by a
723  * Frenchman called Titer which used Altivec to speed this up.
724  * Jeremiah Morris took that code and added it into MythTV.
725  *
726  * All was well until the Intel Macs came along,
727  * which seem to crash when fed YUV420 from MythTV.
728  *
729  * Fortunately, Mino Taoyama has provided an MMX optimised version too.
730  */
731 
/* Plain C I420 -> 2VUY packing.
 *
 * Each output pixel pair is stored as U Y0 V Y1, and one U/V sample is
 * shared by the two rows of a line pair, so the source chroma planes are
 * read at half the luma resolution both horizontally and vertically.
 * Processes v_size/2 line pairs and h_size/2 pixel pairs per line.
 */
static void non_vec_i420_2vuy(
    uint8_t *image, int vuy_stride,
    const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    int h_size, int v_size)
{
    for (int row = 0; row < v_size / 2; row++)
    {
        uint8_t *out0 = image + (2 * row) * vuy_stride;
        uint8_t *out1 = out0 + vuy_stride;
        const uint8_t *luma0 = py + (2 * row) * y_stride;
        const uint8_t *luma1 = luma0 + y_stride;
        const uint8_t *u_row = pu + row * u_stride;
        const uint8_t *v_row = pv + row * v_stride;

        for (int col = 0; col < h_size / 2; col++)
        {
            const uint8_t u = u_row[col];
            const uint8_t v = v_row[col];
            // Same chroma pair feeds both lines of the pair.
            out0[4*col+0] = u;
            out0[4*col+1] = luma0[2*col+0];
            out0[4*col+2] = v;
            out0[4*col+3] = luma0[2*col+1];
            out1[4*col+0] = u;
            out1[4*col+1] = luma1[2*col+0];
            out1[4*col+2] = v;
            out1[4*col+3] = luma1[2*col+1];
        }
    }
}
779 
780 #if HAVE_MMX
781 
// MMX I420 -> 2VUY packing, 16 output pixels (32 bytes) per inner
// iteration, using non-temporal stores.  Falls back to the plain C
// version unless h_size is a multiple of 16 and v_size is even.
793 static void mmx_i420_2vuy(
794  uint8_t *image, int vuy_stride,
795  const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
796  int y_stride, int u_stride, int v_stride,
797  int h_size, int v_size)
798 {
799  uint8_t *pi1, *pi2;
800  const uint8_t *py1 = py;
801  const uint8_t *py2 = py;
802  const uint8_t *pu1 = pu;
803  const uint8_t *pv1 = pv;
804 
805  int x,y;
806 
807  if ((h_size % 16) || (v_size % 2))
808  {
809  non_vec_i420_2vuy(image, vuy_stride,
810  py, pu, pv, y_stride, u_stride, v_stride,
811  h_size, v_size);
812  return;
813  }
814 
815  emms();
816 
817  for (y = 0; y < (v_size>>1); y++)
818  {
819  pi1 = image + 2*y * vuy_stride;
820  pi2 = image + 2*y * vuy_stride + vuy_stride;
821  py1 = py + 2*y * y_stride;
822  py2 = py + 2*y * y_stride + y_stride;
823  pu1 = pu + y * u_stride;
824  pv1 = pv + y * v_stride;
825 
826  for (x = 0; x < h_size / 16; x++)
827  {
828  movq_m2r (*py1, mm0); // y data
829  movq_m2r (*py2, mm1); // y data
830  movq_m2r (*pu1, mm2); // u data
831  movq_m2r (*pv1, mm3); // v data
832 
833  movq_r2r (mm2, mm4); // Copy U
834 
835  punpcklbw_r2r (mm3, mm2); // Combine low U & V mm2 = uv low
836  punpckhbw_r2r (mm3, mm4); // Combine high U & V mm4 = uv high
837 
838  movq_r2r (mm2, mm5); // Copy low UV mm5 = uv low
839  movq_r2r (mm2, mm6); // Copy low UV mm6 = uv low
840  punpcklbw_r2r (mm0, mm5); // mm5 = y1 low uv low
841  punpckhbw_r2r (mm0, mm6); // mm6 = y1 high uv high
842 
843  movntq_r2m (mm5, *(pi1));
844  movntq_r2m (mm6, *(pi1+8));
845 
846  movq_r2r (mm2, mm5); // Copy low UV mm5 = uv low
847  movq_r2r (mm2, mm6); // Copy low UV mm6 = uv low
848  punpcklbw_r2r (mm1, mm5); // mm5 = y2 low uv low
849  punpckhbw_r2r (mm1, mm6); // mm6 = y2 high uv high
850 
851  movntq_r2m (mm5, *(pi2));
852  movntq_r2m (mm6, *(pi2+8));
853 
854 
855  movq_m2r (*(py1+8), mm0); // y data
856  movq_m2r (*(py2+8), mm1); // y data
857 
858  movq_r2r (mm4, mm5); // Copy high UV mm5 = uv high
859  movq_r2r (mm4, mm6); // Copy high UV mm6 = uv high
860  punpcklbw_r2r (mm0, mm5); // mm5 = y1 low uv high
861  punpckhbw_r2r (mm0, mm6); // mm6 = y1 high uv high
862 
863  movntq_r2m (mm5, *(pi1+16));
864  movntq_r2m (mm6, *(pi1+24));
865 
866  movq_r2r (mm4, mm5); // Copy high UV mm5 = uv high
867  movq_r2r (mm4, mm6); // Copy high UV mm6 = uv high
868  punpcklbw_r2r (mm1, mm5); // mm5 = y2 low uv low
869  punpckhbw_r2r (mm1, mm6); // mm6 = y2 high uv high
870 
871  movntq_r2m (mm5, *(pi2+16));
872  movntq_r2m (mm6, *(pi2+24));
873 
874  pi1 += 32;
875  pi2 += 32;
876  py1 += 16;
877  py2 += 16;
878  pu1 += 8;
879  pv1 += 8;
880  }
881  }
882 
883  emms();
884 }
885 
886 #endif // HAVE_MMX
887 
888 #if HAVE_ALTIVEC
889 
890 // Altivec code adapted from VLC's i420_yuv2.c (thanks to Titer and Paul Jara)
891 
// Advance the output/luma cursors to the next pair of lines.  Relies on
// pi1/pi2/py1/py2/h_size being in scope; used only when stride == packed
// width (the callers verify this before entering the vector path).
892 #define VEC_NEXT_LINES() \
893  pi1 = pi2; \
894  pi2 += h_size * 2; \
895  py1 = py2; \
896  py2 += h_size;
897 
// Load 16 U and 16 V bytes into u_vec/v_vec and advance the chroma cursors.
898 #define VEC_LOAD_UV() \
899  u_vec = vec_ld(0, pu); pu += 16; \
900  v_vec = vec_ld(0, pv); pv += 16;
901 
// Interleave chroma with one 16-byte luma chunk from each of the two
// lines and store 32 output bytes per line; 'a' selects the high
// (vec_mergeh) or low (vec_mergel) half of the chroma vectors.
902 #define VEC_MERGE(a) \
903  uv_vec = a(u_vec, v_vec); \
904  y_vec = vec_ld(0, py1); py1 += 16; \
905  vec_st(vec_mergeh(uv_vec, y_vec), 0, pi1); pi1 += 16; \
906  vec_st(vec_mergel(uv_vec, y_vec), 0, pi1); pi1 += 16; \
907  y_vec = vec_ld(0, py2); py2 += 16; \
908  vec_st(vec_mergeh(uv_vec, y_vec), 0, pi2); pi2 += 16; \
909  vec_st(vec_mergel(uv_vec, y_vec), 0, pi2); pi2 += 16;
910 
// AltiVec I420 -> 2VUY packing.  Requires all planes to be tightly packed
// (stride == packed width) and h_size a multiple of 16 with matching
// v_size restrictions; otherwise falls back to the plain C version.
923 static void altivec_i420_2vuy(
924  uint8_t *image, int vuy_stride,
925  const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
926  int y_stride, int u_stride, int v_stride,
927  int h_size, int v_size)
928 {
929  uint8_t *pi1, *pi2 = image;
930  const uint8_t *py1;
931  const uint8_t *py2 = py;
932 
933  int x, y;
934 
935  vector unsigned char u_vec;
936  vector unsigned char v_vec;
937  vector unsigned char uv_vec;
938  vector unsigned char y_vec;
939 
940  int vuy_extra = vuy_stride - (h_size<<1);
941  int y_extra = y_stride - (h_size);
942  int u_extra = u_stride - (h_size>>1);
943  int v_extra = v_stride - (h_size>>1);
944 
945  if (vuy_extra || y_extra || u_extra || v_extra)
946  {
947  // Fall back to C version
948  non_vec_i420_2vuy(image, vuy_stride,
949  py, pu, pv,
950  y_stride, u_stride, v_stride,
951  h_size, v_size);
952  return;
953  }
954 
955  if (!((h_size % 32) || (v_size % 2)))
956  {
957  // Width is a multiple of 32, process 2 lines at a time
958  for (y = v_size / 2; y--; )
959  {
960  VEC_NEXT_LINES();
961  for (x = h_size / 32; x--; )
962  {
963  VEC_LOAD_UV();
964  VEC_MERGE(vec_mergeh);
965  VEC_MERGE(vec_mergel);
966  }
967  }
968 
969  }
970  else if (!((h_size % 16) || (v_size % 4)))
971  {
972  // Width is a multiple of 16, process 4 lines at a time
973  for (y = v_size / 4; y--; )
974  {
975  // Lines 1-2, pixels 0 to (width - 16)
976  VEC_NEXT_LINES();
977  for (x = h_size / 32; x--; )
978  {
979  VEC_LOAD_UV();
980  VEC_MERGE(vec_mergeh);
981  VEC_MERGE(vec_mergel);
982  }
983 
984  // Lines 1-2, pixels (width - 16) to width
985  VEC_LOAD_UV();
986  VEC_MERGE(vec_mergeh);
987 
988  // Lines 3-4, pixels 0-16
989  VEC_NEXT_LINES();
990  VEC_MERGE(vec_mergel);
991 
992  // Lines 3-4, pixels 16 to width
993  for (x = h_size / 32; x--; )
994  {
995  VEC_LOAD_UV();
996  VEC_MERGE(vec_mergeh);
997  VEC_MERGE(vec_mergel);
998  }
999  }
1000  }
1001  else
1002  {
1003  // Fall back to C version
1004  non_vec_i420_2vuy(image, vuy_stride,
1005  py, pu, pv,
1006  y_stride, u_stride, v_stride,
1007  h_size, v_size);
1008  }
1009 }
1010 
1011 #endif // HAVE_ALTIVEC
1012 
1013 
1028 {
1029 #if HAVE_ALTIVEC
1030  if (has_altivec())
1031  return altivec_i420_2vuy;
1032 #endif
1033 #if HAVE_MMX
1034  return mmx_i420_2vuy;
1035 #else
1036  return non_vec_i420_2vuy; /* Fallback to C */
1037 #endif
1038 }
1039 
/* Plain C 2VUY -> I420 unpacking.
 *
 * Inverse of non_vec_i420_2vuy: each input pixel pair is U Y0 V Y1.
 * The output chroma sample is the truncating average of the two source
 * lines' shared U (resp. V) bytes.  Processes v_size/2 line pairs and
 * h_size/2 pixel pairs per line.
 */
static void non_vec_2vuy_i420(
    uint8_t *py, uint8_t *pu, uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    const uint8_t *image, int vuy_stride,
    int h_size, int v_size)
{
    for (int row = 0; row < v_size / 2; row++)
    {
        const uint8_t *in0 = image + (2 * row) * vuy_stride;
        const uint8_t *in1 = in0 + vuy_stride;
        uint8_t *luma0 = py + (2 * row) * y_stride;
        uint8_t *luma1 = luma0 + y_stride;
        uint8_t *u_row = pu + row * u_stride;
        uint8_t *v_row = pv + row * v_stride;

        for (int col = 0; col < h_size / 2; col++)
        {
            // Average the two lines' chroma back down to 4:2:0.
            u_row[col] = (in0[4*col+0] + in1[4*col+0]) >> 1;
            v_row[col] = (in0[4*col+2] + in1[4*col+2]) >> 1;
            luma0[2*col+0] = in0[4*col+1];
            luma0[2*col+1] = in0[4*col+3];
            luma1[2*col+0] = in1[4*col+1];
            luma1[2*col+1] = in1[4*col+3];
        }
    }
}
1080 
1081 #if HAVE_ALTIVEC
1082 
1083 // Altivec code adapted from VLC's i420_yuv2.c (thanks to Titer and Paul Jara)
1084 
// Read 32 interleaved UYVY bytes from 'ptr': the low bytes of each
// 16-bit lane (the Y samples) are packed into the luma plane 'y', and the
// high bytes (the interleaved U/V samples) are returned in 'uv'.
1085 #define VEC_READ_LINE(ptr, y, uv) \
1086  pa_vec = vec_ld(0, ptr); ptr += 16; \
1087  pb_vec = vec_ld(0, ptr); ptr += 16; \
1088  vec_st(vec_pack((vector unsigned short)pa_vec, \
1089  (vector unsigned short)pb_vec), \
1090  0, y); y += 16; \
1091  uv = vec_pack(vec_sr((vector unsigned short)pa_vec, eight_vec), \
1092  vec_sr((vector unsigned short)pb_vec, eight_vec));
1093 
// Split one chunk from each of the two lines and average their chroma
// into 'a' (vec_avg rounds, unlike the C fallback's truncating shift).
1094 #define VEC_SPLIT(a) \
1095  VEC_READ_LINE(pi1, py1, uv1_vec); \
1096  VEC_READ_LINE(pi2, py2, uv2_vec); \
1097  a = vec_avg(uv1_vec, uv2_vec);
1098 
// De-interleave two averaged UV vectors into the planar U and V outputs.
1099 #define VEC_STORE_UV() \
1100  vec_st(vec_pack((vector unsigned short)uva_vec, \
1101  (vector unsigned short)uvb_vec), \
1102  0, pv); pv += 16; \
1103  vec_st(vec_pack(vec_sr((vector unsigned short)uva_vec, eight_vec), \
1104  vec_sr((vector unsigned short)uvb_vec, eight_vec)), \
1105  0, pu); pu += 16;
1107 
// AltiVec 2VUY -> I420 unpacking.  Requires tightly packed planes
// (stride == packed width) and h_size a multiple of 16; otherwise falls
// back to the plain C version.
1117 static void altivec_2vuy_i420(
1118  uint8_t *py, uint8_t *pu, uint8_t *pv,
1119  int y_stride, int u_stride, int v_stride,
1120  const uint8_t *image, int vuy_stride,
1121  int h_size, int v_size)
1122 {
1123  const uint8_t *pi1;
1124  const uint8_t *pi2 = image;
1125  uint8_t *py1, *py2 = py;
1126 
1127  int x, y;
1128 
1129  vector unsigned short eight_vec = vec_splat_u16(8);
1130  vector unsigned char pa_vec, pb_vec,
1131  uv1_vec, uv2_vec,
1132  uva_vec, uvb_vec;
1133 
1134  int vuy_extra = vuy_stride - (h_size<<1);
1135  int y_extra = y_stride - (h_size);
1136  int u_extra = u_stride - (h_size>>1);
1137  int v_extra = v_stride - (h_size>>1);
1138 
1139  if (vuy_extra || y_extra || u_extra || v_extra)
1140  {
1141  // Fall back to C version
1142  non_vec_2vuy_i420(py, pu, pv,
1143  y_stride, u_stride, v_stride,
1144  image, vuy_stride,
1145  h_size, v_size);
1146  return;
1147  }
1148 
1149  if (!((h_size % 32) || (v_size % 2)))
1150  {
1151  // Width is a multiple of 32, process 2 lines at a time
1152  for (y = v_size / 2; y--; )
1153  {
1154  VEC_NEXT_LINES();
1155  for (x = h_size / 32; x--; )
1156  {
1157  VEC_SPLIT(uva_vec);
1158  VEC_SPLIT(uvb_vec);
1159  VEC_STORE_UV();
1160  }
1161  }
1162  }
1163  else if (!((h_size % 16) || (v_size % 4)))
1164  {
1165  // Width is a multiple of 16, process 4 lines at a time
1166  for (y = v_size / 4; y--; )
1167  {
1168  // Lines 1-2, pixels 0 to (width - 16)
1169  VEC_NEXT_LINES();
1170  for (x = h_size / 32; x--; )
1171  {
1172  VEC_SPLIT(uva_vec);
1173  VEC_SPLIT(uvb_vec);
1174  VEC_STORE_UV();
1175  }
1176 
1177  // Lines 1-2, pixels (width - 16) to width
1178  VEC_SPLIT(uva_vec);
1179 
1180  // Lines 3-4, pixels 0-16
1181  VEC_NEXT_LINES();
1182  VEC_SPLIT(uvb_vec);
1183  VEC_STORE_UV();
1184 
1185  // Lines 3-4, pixels 16 to width
1186  for (x = h_size / 32; x--; )
1187  {
1188  VEC_SPLIT(uva_vec);
1189  VEC_SPLIT(uvb_vec);
1190  VEC_STORE_UV();
1191  }
1192  }
1193  }
1194  else
1195  {
1196  // Fall back to C version
1197  non_vec_2vuy_i420(py, pu, pv,
1198  y_stride, u_stride, v_stride,
1199  image, vuy_stride,
1200  h_size, v_size);
1201  }
1202 }
1203 
1204 #endif // HAVE_ALTIVEC
1205 
1206 
1221 {
1222 #if HAVE_ALTIVEC
1223  if (has_altivec())
1224  return altivec_2vuy_i420;
1225 #endif
1226  return non_vec_2vuy_i420; /* Fallback to C */
1227 }
yuv2rgb_fun yuv2rgb_init_mmx(int bpp, int mode)
This returns a yuv to rgba converter, using mmx if MMX was compiled in.
Definition: yuv2rgb.cpp:387
#define paddsw_r2r(regs, regd)
Definition: ffmpeg-mmx.h:122
static void non_vec_i420_2vuy(uint8_t *image, int vuy_stride, const uint8_t *py, const uint8_t *pu, const uint8_t *pv, int y_stride, int u_stride, int v_stride, int h_size, int v_size)
Plain C I420 to 2VUY conversion function.
Definition: yuv2rgb.cpp:744
void(* yuv2rgb_fun)(uint8_t *image, uint8_t *py, uint8_t *pu, uint8_t *pv, int h_size, int v_size, int rgb_stride, int y_stride, int uv_stride, int alphaones)
Definition: yuv2rgb.h:32
static const mmx_t mmx_1s
Definition: util-opengl.cpp:16
#define movntq(src, dest)
Definition: yuv2rgb.cpp:78
#define G_II
#define movntq_r2m(mmreg, var)
Definition: ffmpeg-mmx.h:250
#define pmulhw_m2r(var, reg)
Definition: ffmpeg-mmx.h:153
yuv2rgb_fun yuv2rgb_init_mmxext(int bpp, int mode)
This returns a yuv to rgba converter, using mmxext if MMX was compiled in.
Definition: yuv2rgb.cpp:363
#define punpcklwd_r2r(regs, regd)
Definition: ffmpeg-mmx.h:218
#define punpckhbw_r2r(regs, regd)
Definition: ffmpeg-mmx.h:207
#define psubusb_m2r(var, reg)
Definition: ffmpeg-mmx.h:201
#define movq_r2r(regs, regd)
Definition: ffmpeg-mmx.h:101
unsigned char r
Definition: ParseText.cpp:340
#define A_OI
#define punpckhwd_r2r(regs, regd)
Definition: ffmpeg-mmx.h:211
#define punpcklbw_r2r(regs, regd)
Definition: ffmpeg-mmx.h:214
unsigned char b
Definition: ParseText.cpp:340
conv_i420_2vuy_fun get_i420_2vuy_conv(void)
Returns I420 to 2VUY conversion function.
Definition: yuv2rgb.cpp:1027
#define B_OI
#define pxor_r2r(regs, regd)
Definition: ffmpeg-mmx.h:221
#define por_r2r(regs, regd)
Definition: ffmpeg-mmx.h:160
#define pand_m2r(var, reg)
Definition: ffmpeg-mmx.h:129
#define A_II
static const uint16_t * d
#define R_II
#define movd_m2r(var, reg)
Definition: ffmpeg-mmx.h:88
#define C_GV
Definition: yuv2rgb.cpp:407
static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py, unsigned char *pu, unsigned char *pv, int h_size, int v_size, int rgb_stride, int y_stride, int uv_stride, int alphaones)
Definition: yuv2rgb.cpp:428
#define psrlq_i2r(imm, reg)
Definition: ffmpeg-mmx.h:182
void(* conv_i420_2vuy_fun)(uint8_t *image, int vuy_stride, const uint8_t *py, const uint8_t *pu, const uint8_t *pv, int y_stride, int u_stride, int v_stride, int h_size, int v_size)
Definition: yuv2rgb.h:53
#define SCALE_BITS
Definition: yuv2rgb.cpp:401
#define B_II
#define MODE_RGB
Definition: yuv2rgb.h:29
#define ONE_HALF
Definition: yuv2rgb.cpp:495
#define C_GU
Definition: yuv2rgb.cpp:406
#define packuswb_r2r(regs, regd)
Definition: ffmpeg-mmx.h:110
#define psubsw_m2r(var, reg)
Definition: ffmpeg-mmx.h:198
#define emms()
Definition: mm_arch.h:15
conv_2vuy_i420_fun get_2vuy_i420_conv(void)
Returns 2VUY to I420 conversion function.
Definition: yuv2rgb.cpp:1220
#define RGBOUT(r, g, b, y1)
Definition: yuv2rgb.cpp:420
static void non_vec_2vuy_i420(uint8_t *py, uint8_t *pu, uint8_t *pv, int y_stride, int u_stride, int v_stride, const uint8_t *image, int vuy_stride, int h_size, int v_size)
Plain C 2VUY to I420 conversion routine.
Definition: yuv2rgb.cpp:1049
#define C_BU
Definition: yuv2rgb.cpp:405
#define FIX(x)
Definition: yuv2rgb.cpp:496
#define psllq_i2r(imm, reg)
Definition: ffmpeg-mmx.h:165
#define R_OI
#define C_RV
Definition: yuv2rgb.cpp:404
void(* conv_2vuy_i420_fun)(uint8_t *py, uint8_t *pu, uint8_t *pv, int y_stride, int u_stride, int v_stride, const uint8_t *image, int vuy_stride, int h_size, int v_size)
Definition: yuv2rgb.h:62
#define psllw_i2r(imm, reg)
Definition: ffmpeg-mmx.h:168
#define psrlw_i2r(imm, reg)
Definition: ffmpeg-mmx.h:185
#define movq_m2r(var, reg)
Definition: ffmpeg-mmx.h:98
#define G_OI
void rgb32_to_yuv420p(unsigned char *lum, unsigned char *cb, unsigned char *cr, unsigned char *alpha, unsigned char *src, int width, int height, int srcwidth)
Convert planar RGB to YUV420.
Definition: yuv2rgb.cpp:502
#define SCALEBITS
Definition: yuv2rgb.cpp:494
unsigned char g
Definition: ParseText.cpp:340