1
#include
<
windows.h
>
2
3
/*
 * Lookup tables filled by init_dither_tab() and read by YUV2RGB420().
 * The four chroma tables and the luma table are indexed by the raw 8-bit
 * sample value and hold 16.16 fixed-point products (the reader shifts the
 * sums right by 16).
 */
long crv_tab[256];      /* (v - 128) * 104597 */
long cbu_tab[256];      /* (u - 128) * 132201 */
long cgu_tab[256];      /* (u - 128) * 25675  */

long cgv_tab[256];      /* (v - 128) * 53279  */
long tab_76309[256];    /* 76309 * (y - 16)   */

/*
 * Clamp table: entry k holds (k - 384) limited to 0..255, so it can be
 * indexed with 384 + value for values in the range -384..639.
 */
unsigned char clp[1024];
10
11
12
void
init_dither_tab()
13
{
14
long
int
crv,cbu,cgu,cgv;
15
int
i,ind;
16
17
crv
=
104597
; cbu
=
132201
;
18
cgu
=
25675
; cgv
=
53279
;
19
20
for
(i
=
0
; i
<
256
; i
++
) {
21
crv_tab[i]
=
(i
-
128
)
*
crv;
22
cbu_tab[i]
=
(i
-
128
)
*
cbu;
23
cgu_tab[i]
=
(i
-
128
)
*
cgu;
24
cgv_tab[i]
=
(i
-
128
)
*
cgv;
25
tab_76309[i]
=
76309
*
(i
-
16
);
26
}
27
28
for
(i
=
0
; i
<
384
; i
++
)
29
clp[i]
=
0
;
30
ind
=
384
;
31
for
(i
=
0
;i
<
256
; i
++
)
32
clp[ind
++
]
=
i;
33
ind
=
640
;
34
for
(i
=
0
;i
<
384
;i
++
)
35
clp[ind
++
]
=
255
;
36
}
37
38
39
void
YUV2RGB420(unsigned
char
*
src0,unsigned
char
*
src1,unsigned
char
*
src2,unsigned
char
*
dst_ori,
int
width,
int
height)
40
{
41
int
y1,y2,u,v;
42
unsigned
char
*
py1,
*
py2;
43
int
i,j, c1, c2, c3, c4;
44
unsigned
char
*
d1,
*
d2;
45
46
//
src0=src;
47
//
src1=src+width*height;
48
//
src2=src+width*height+width*height/4;
49
50
py1
=
src0;
51
py2
=
py1
+
width;
52
d1
=
dst_ori;
53
d2
=
d1
+
3
*
width;
54
for
(j
=
0
; j
<
height; j
+=
2
) {
55
for
(i
=
0
; i
<
width; i
+=
2
) {
56
57
u
=
*
src1
++
;
58
v
=
*
src2
++
;
59
60
c1
=
crv_tab[v];
61
c2
=
cgu_tab[u];
62
c3
=
cgv_tab[v];
63
c4
=
cbu_tab[u];
64
65
//
up-left
66
y1
=
tab_76309[
*
py1
++
];
67
*
d1
++
=
clp[
384
+
((y1
+
c1)
>>
16
)];
68
*
d1
++
=
clp[
384
+
((y1
-
c2
-
c3)
>>
16
)];
69
*
d1
++
=
clp[
384
+
((y1
+
c4)
>>
16
)];
70
71
//
down-left
72
y2
=
tab_76309[
*
py2
++
];
73
*
d2
++
=
clp[
384
+
((y2
+
c1)
>>
16
)];
74
*
d2
++
=
clp[
384
+
((y2
-
c2
-
c3)
>>
16
)];
75
*
d2
++
=
clp[
384
+
((y2
+
c4)
>>
16
)];
76
77
//
up-right
78
y1
=
tab_76309[
*
py1
++
];
79
*
d1
++
=
clp[
384
+
((y1
+
c1)
>>
16
)];
80
*
d1
++
=
clp[
384
+
((y1
-
c2
-
c3)
>>
16
)];
81
*
d1
++
=
clp[
384
+
((y1
+
c4)
>>
16
)];
82
83
//
down-right
84
y2
=
tab_76309[
*
py2
++
];
85
*
d2
++
=
clp[
384
+
((y2
+
c1)
>>
16
)];
86
*
d2
++
=
clp[
384
+
((y2
-
c2
-
c3)
>>
16
)];
87
*
d2
++
=
clp[
384
+
((y2
+
c4)
>>
16
)];
88
}
89
d1
+=
3
*
width;
90
d2
+=
3
*
width;
91
py1
+=
width;
92
py2
+=
width;
93
}
94
}
95
96
97
98
// How to use (the negative height asks YUV_TO_RGB24 to flip the picture vertically):
// YUV_TO_RGB24(pY,width,pU,pV,width>>1,pRGBBuf,width,(int)0-height,width*3);
100
typedef UCHAR uint8_t;
101
typedef ULONGLONG uint64_t;
102
103
#define
MAXIMUM_Y_WIDTH 800
104
static
uint64_t mmw_mult_Y
=
0x2568256825682568
;
105
static
uint64_t mmw_mult_U_G
=
0xf36ef36ef36ef36e
;
106
static
uint64_t mmw_mult_U_B
=
0x40cf40cf40cf40cf
;
107
static
uint64_t mmw_mult_V_R
=
0x3343334333433343
;
108
static
uint64_t mmw_mult_V_G
=
0xe5e2e5e2e5e2e5e2
;
109
110
111
static
uint64_t mmb_0x10
=
0x1010101010101010
;
112
static
uint64_t mmw_0x0080
=
0x0080008000800080
;
113
static
uint64_t mmw_0x00ff
=
0x00ff00ff00ff00ff
;
114
115
static
uint64_t mmw_cut_red
=
0x7c007c007c007c00
;
116
static
uint64_t mmw_cut_green
=
0x03e003e003e003e0
;
117
static
uint64_t mmw_cut_blue
=
0x001f001f001f001f
;
118
119
120
/*
 * Convert planar YUV 4:2:0 to packed 24-bit output with MMX instructions
 * (MSVC `_asm` block, 32-bit x86 registers).
 *
 *   puc_y, stride_y      luma pointer and the byte step applied after each row
 *   puc_u, puc_v,
 *   stride_uv            chroma pointers and the byte step applied after every
 *                        second row (each chroma row serves two luma rows)
 *   puc_out, stride_out  output pointer and the byte step applied after each row
 *   width_y              picture width in pixels; the asm loop handles 8 pixels
 *                        (24 output bytes) per iteration, so only
 *                        (width_y >> 3) * 8 pixels of each row are converted
 *   height_y             number of rows; a negative value converts -height_y
 *                        rows reading the source planes bottom-up, which flips
 *                        the picture vertically
 *
 * Each output pixel is stored as three bytes taken from the U-derived,
 * U/V-derived and V-derived sums, i.e. B, G, R order going by the names of
 * the multiplier constants.
 *
 * Every asm iteration stores eight 4-byte movd's at offsets 0, 3, ..., 21, so
 * the final store of a row touches one byte past the 24 * groups bytes it
 * produces.  For all rows but the last that byte belongs to the following
 * data; the last row is therefore rendered into temp_buff and then copied.
 *
 * The call returns without writing anything when width_y is below 8 (the
 * asm loop counter would start at zero or above and never terminate
 * properly) or above MAXIMUM_Y_WIDTH (the last row would not fit in
 * temp_buff).
 */
void YUV_TO_RGB24(uint8_t *puc_y, int stride_y,
                  uint8_t *puc_u, uint8_t *puc_v, int stride_uv,
                  uint8_t *puc_out, int width_y, int height_y, int stride_out)
{
    int y, horiz_count;
    uint8_t *puc_out_remembered = puc_out;
    /* function scope: the asm writes through a pointer to this buffer after
       the block that selects it has been left */
    uint8_t temp_buff[3 * MAXIMUM_Y_WIDTH + 1];

    if (width_y < 8 || width_y > MAXIMUM_Y_WIDTH) {
        return;
    }

    if (height_y < 0) {
        // we are flipping our output upside-down
        height_y = -height_y;
        puc_y += (height_y - 1) * stride_y;
        puc_u += (height_y / 2 - 1) * stride_uv;
        puc_v += (height_y / 2 - 1) * stride_uv;
        stride_y = -stride_y;
        stride_uv = -stride_uv;
    }

    // negative group count: the asm loop increments it until it reaches zero
    horiz_count = -(width_y >> 3);

    for (y = 0; y < height_y; y++) {
        if (y == height_y - 1) {
            // this is the last output line - we need to be careful not to overrun the end of this line
            puc_out_remembered = puc_out;
            puc_out = temp_buff;    // write the RGB to a temporary store
        }
        _asm {
            push eax
            push ebx
            push ecx
            push edx
            push edi

            mov eax, puc_out
            mov ebx, puc_y
            mov ecx, puc_u
            mov edx, puc_v
            mov edi, horiz_count

        horiz_loop:

            movd mm2, [ecx]
            pxor mm7, mm7

            movd mm3, [edx]
            punpcklbw mm2, mm7

            movq mm0, [ebx]
            punpcklbw mm3, mm7

            movq mm1, mmw_0x00ff

            psubusb mm0, mmb_0x10

            psubw mm2, mmw_0x0080
            pand mm1, mm0

            psubw mm3, mmw_0x0080
            psllw mm1, 3

            psrlw mm0, 8
            psllw mm2, 3

            pmulhw mm1, mmw_mult_Y
            psllw mm0, 3

            psllw mm3, 3
            movq mm5, mm3

            pmulhw mm5, mmw_mult_V_R
            movq mm4, mm2

            pmulhw mm0, mmw_mult_Y
            movq mm7, mm1

            pmulhw mm2, mmw_mult_U_G
            paddsw mm7, mm5

            pmulhw mm3, mmw_mult_V_G
            packuswb mm7, mm7

            pmulhw mm4, mmw_mult_U_B
            paddsw mm5, mm0

            packuswb mm5, mm5
            paddsw mm2, mm3

            movq mm3, mm1
            movq mm6, mm1

            paddsw mm3, mm4
            paddsw mm6, mm2

            punpcklbw mm7, mm5
            paddsw mm2, mm0

            packuswb mm6, mm6
            packuswb mm2, mm2

            packuswb mm3, mm3
            paddsw mm4, mm0

            packuswb mm4, mm4
            punpcklbw mm6, mm2

            punpcklbw mm3, mm4

            // 32-bit shuffle.
            pxor mm0, mm0

            movq mm1, mm6
            punpcklbw mm1, mm0

            movq mm0, mm3
            punpcklbw mm0, mm7

            movq mm2, mm0

            punpcklbw mm0, mm1
            punpckhbw mm2, mm1

            // 24-bit shuffle and save
            movd [eax], mm0
            psrlq mm0, 32

            movd 3[eax], mm0

            movd 6[eax], mm2


            psrlq mm2, 32

            movd 9[eax], mm2

            // 32-bit shuffle.
            pxor mm0, mm0

            movq mm1, mm6
            punpckhbw mm1, mm0

            movq mm0, mm3
            punpckhbw mm0, mm7

            movq mm2, mm0

            punpcklbw mm0, mm1
            punpckhbw mm2, mm1

            // 24-bit shuffle and save
            movd 12[eax], mm0
            psrlq mm0, 32

            movd 15[eax], mm0
            add ebx, 8

            movd 18[eax], mm2
            psrlq mm2, 32

            add ecx, 4
            add edx, 4

            movd 21[eax], mm2
            add eax, 24

            inc edi
            jne horiz_loop

            pop edi
            pop edx
            pop ecx
            pop ebx
            pop eax

            emms
        }

        if (y == height_y - 1) {
            // last line of output - we have used the temp_buff and need to copy
            // only the bytes the asm loop actually produced (24 per 8-pixel group)
            int x = 24 * (width_y >> 3);            // iteration counter
            uint8_t *ps = puc_out;                  // source pointer (temporary line store)
            uint8_t *pd = puc_out_remembered;       // dest pointer
            while (x--) *(pd++) = *(ps++);          // copy the line
        }

        puc_y += stride_y;
        if (y % 2) {
            // chroma rows advance once per two luma rows
            puc_u += stride_uv;
            puc_v += stride_uv;
        }
        puc_out += stride_out;
    }
}
314