xBRZ/src/xbrz_tools.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262

// ****************************************************************************
// * This file is part of the xBRZ project. It is distributed under           *
// * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
// * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
// *                                                                          *
// * Additionally and as a special exception, the author gives permission     *
// * to link the code of this program with the following libraries            *
// * (or with modified versions that use the same licenses), and distribute   *
// * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
// *                                                                          *
// * You must obey the GNU General Public License in all respects for all of  *
// * the code used other than MAME, FreeFileSync, Snes9x, ePSXe.              *
// * If you modify this file, you may extend this exception to your version   *
// * of the file, but you are not obligated to do so. If you do not wish to   *
// * do so, delete this exception statement from your version.                *
// ****************************************************************************

#ifndef XBRZ_TOOLS_H_825480175091875
#define XBRZ_TOOLS_H_825480175091875

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <type_traits>
#include <vector>


namespace xbrz
{
//extract byte number N from a 32-bit value: N = 0 is the least significant byte, N = 3 the most significant one
template <uint32_t N> constexpr
unsigned char getByte(uint32_t val)
{
    static_assert(N < sizeof(uint32_t), "byte index out of range: shifting a 32-bit value by >= 32 bits is undefined behavior");
    return static_cast<unsigned char>((val >> (8 * N)) & 0xff);
}

//channel accessors for pixels packed as 0xAARRGGBB
constexpr unsigned char getAlpha(uint32_t pix) { return getByte<3>(pix); }
constexpr unsigned char getRed  (uint32_t pix) { return getByte<2>(pix); }
constexpr unsigned char getGreen(uint32_t pix) { return getByte<1>(pix); }
constexpr unsigned char getBlue (uint32_t pix) { return getByte<0>(pix); }

//pack four channel values into one 32-bit pixel laid out as 0xAARRGGBB
//the inputs are not masked: each one is expected to fit into 8 bits
inline uint32_t makePixel(uint32_t a, uint32_t r, uint32_t g, uint32_t b)
{
    uint32_t pix = b;
    pix |= g <<  8;
    pix |= r << 16;
    pix |= a << 24;
    return pix;
}

//overload without alpha: only red, green and blue are combined (0x00RRGGBB for 8-bit inputs)
inline uint32_t makePixel(uint32_t r, uint32_t g, uint32_t b)
{
    uint32_t pix = b;
    pix |= g <<  8;
    pix |= r << 16;
    return pix;
}

//expand 15-bit RGB (5-5-5, bit 15 is ignored) to 0x00RRGGBB: every channel is moved into the upper bits of its byte, the lower 3 bits stay zero
inline uint32_t rgb555to888(uint16_t pix)
{
    const uint32_t red   = (pix & 0x7C00) << 9;
    const uint32_t green = (pix & 0x03E0) << 6;
    const uint32_t blue  = (pix & 0x001F) << 3;
    return red | green | blue;
}

//expand 16-bit RGB (5-6-5) to 0x00RRGGBB: red/blue keep their lower 3 bits zero, green its lower 2 bits
inline uint32_t rgb565to888(uint16_t pix)
{
    const uint32_t red   = (pix & 0xF800) << 8;
    const uint32_t green = (pix & 0x07E0) << 5;
    const uint32_t blue  = (pix & 0x001F) << 3;
    return red | green | blue;
}

//reduce 0x..RRGGBB to 15-bit RGB (5-5-5) by keeping the upper 5 bits of every channel; bits 24-31 of the input are ignored
inline uint16_t rgb888to555(uint32_t pix)
{
    const uint32_t red   = (pix & 0xF80000) >> 9;
    const uint32_t green = (pix & 0x00F800) >> 6;
    const uint32_t blue  = (pix & 0x0000F8) >> 3;
    return static_cast<uint16_t>(red | green | blue);
}

//reduce 0x..RRGGBB to 16-bit RGB (5-6-5): upper 5 bits of red/blue, upper 6 bits of green; bits 24-31 of the input are ignored
inline uint16_t rgb888to565(uint32_t pix)
{
    const uint32_t red   = (pix & 0xF80000) >> 8;
    const uint32_t green = (pix & 0x00FC00) >> 5;
    const uint32_t blue  = (pix & 0x0000F8) >> 3;
    return static_cast<uint16_t>(red | green | blue);
}


//one pixel as four raw bytes; the pixel reader/writer callbacks passed to the scalers below exchange pixels in this form
using BytePixel = unsigned char[4]; //unspecified byte order
static_assert(std::alignment_of_v<BytePixel> == 1); // :)


template <class PixReader, class PixWriter> inline
void unscaledCopy(PixReader srcReader /* (int x, int y, BytePixel& pix) */,
                  PixWriter trgWriter /* (const BytePixel& pix)         */, int width, int height)
{
    for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; ++x)
        {
            BytePixel pix; //uninitialized
            srcReader(x, y, pix);
            trgWriter(pix);
        }
}


//nearest-neighbor (going over target image - slow for upscaling, since source is read multiple times missing out on cache! Fast for similar image sizes!)
//  - srcReader: (int x, int y, BytePixel& pix) fills pix with the source pixel at (x, y)
//  - trgWriter: (const BytePixel& pix) is called once per target pixel, left to right, row by row
//  - only target rows [yFirst, yLast) are processed, after clamping the range to [0, trgHeight)
//  - source coordinate = floor(srcSize * trgPos / trgSize); the product is evaluated in 64 bit because
//    a plain int multiplication overflows (undefined behavior) once both images exceed roughly 46000 pixels per dimension
template <class PixReader, class PixWriter>
void nearestNeighborScale(PixReader srcReader /* (int x, int y, BytePixel& pix) */, int srcWidth, int srcHeight,
                          PixWriter trgWriter /* (const BytePixel& pix)         */, int trgWidth, int trgHeight,
                          int yFirst, int yLast)
{
    yFirst = std::max(yFirst, 0);
    yLast  = std::min(yLast, trgHeight);
    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0) return; //also covers trgHeight <= 0 => no division by zero below

    for (int y = yFirst; y < yLast; ++y)
    {
        //y < trgHeight => quotient < srcHeight => fits into int again
        const int ySrc = static_cast<int>(static_cast<long long>(srcHeight) * y / trgHeight);

        for (int x = 0; x < trgWidth; ++x)
        {
            //x < trgWidth => quotient < srcWidth => fits into int again
            const int xSrc = static_cast<int>(static_cast<long long>(srcWidth) * x / trgWidth);

            BytePixel pix; //uninitialized
            srcReader(xSrc, ySrc, pix);
            trgWriter(pix);
        }
    }
}


//unsigned integer division rounded to the nearest integer (ties are rounded up); den must not be 0
//evaluated via quotient and remainder: the short form "(num + den / 2) / den" wraps around for num close to UINT_MAX and then yields a far too small result
inline
unsigned int uintDivRound(unsigned int num, unsigned int den)
{
    assert(den != 0);
    const unsigned int quot = num / den;
    const unsigned int rem  = num % den;
    //round up if rem >= ceil(den / 2); "den - den / 2" equals ceil(den / 2) and cannot overflow
    return quot + (rem >= den - den / 2 ? 1 : 0);
}


//scale color channel c by alpha / 255, rounded to the nearest integer
inline
unsigned char premultiply(unsigned char c, unsigned char alpha)
{
    //premultiply/demultiply using int div round is more accurate than int div floor/ceil pair
    const unsigned int weighted = static_cast<unsigned int>(c) * alpha;
    return static_cast<unsigned char>(uintDivRound(weighted, 255));
}


//inverse of premultiply(): scale color channel c by 255 / alpha, rounded to the nearest integer and limited to 255
inline
unsigned char demultiply(unsigned char c, unsigned char alpha)
{
    if (alpha == 0)
        return 0; //result is defined as 0; also keeps the division below away from a zero denominator

    const unsigned int restored = uintDivRound(static_cast<unsigned int>(c) * 255, alpha);
    return static_cast<unsigned char>(std::clamp(restored, 0U, 255U)); //c > alpha exceeds 255 => saturate
}


//bilinear interpolation (going over target image)
//  - srcReader: (int x, int y, BytePixel& pix) fills pix with the source pixel at (x, y)
//  - trgWriter: (const BytePixel& pix) is called once per target pixel, left to right, row by row
//  - only target rows [yFirst, yLast) are processed, after clamping the range to [0, trgHeight)
//  - each of the four pixel bytes is interpolated independently from the 2x2 source neighborhood
//caveat: treats alpha channel like regular color! => caller needs to pre/de-multiply alpha!
template <class PixReader, class PixWriter>
void bilinearScaleSimple(PixReader srcReader /* (int x, int y, BytePixel& pix) */, int srcWidth, int srcHeight,
                         PixWriter trgWriter /* (const BytePixel& pix)         */, int trgWidth, int trgHeight,
                         int yFirst, int yLast)
{
    yFirst = std::max(yFirst, 0);
    yLast  = std::min(yLast, trgHeight);
    if (yFirst >= yLast || srcHeight <= 0 || srcWidth <= 0 ||
        trgWidth <= 0) //negative trgWidth would be converted to a huge std::vector size below; zero width has nothing to write
        return;

    const double scaleX = static_cast<double>(trgWidth ) / srcWidth;
    const double scaleY = static_cast<double>(trgHeight) / srcHeight;

    //perf notes:
    //    -> double-based calculation is (slightly) faster than float
    //    -> pre-calculation gives significant boost; std::vector<> memory allocation is negligible!
    struct CoeffsX
    {
        int     x1 = 0; //left  source column
        int     x2 = 0; //right source column (== x1 at the right image border)
        double xx1 = 0; //weight of the right column: x - x1
        double x2x = 0; //weight of the left  column: 1 - (x - x1)
    };
    std::vector<CoeffsX> buf(trgWidth);
    for (int x = 0; x < trgWidth; ++x)
    {
        //64-bit product: "srcWidth * x" as plain int overflows (undefined behavior) for very large images; quotient < srcWidth
        const int x1 = static_cast<int>(static_cast<long long>(srcWidth) * x / trgWidth);
        int x2 = x1 + 1;
        if (x2 == srcWidth) //stay inside the source image
            --x2;

        const double xx1 = x / scaleX - x1;
        const double x2x = 1 - xx1;

        buf[x] = {x1, x2, xx1, x2x};
    }

    for (int y = yFirst; y < yLast; ++y)
    {
        //64-bit product for the same reason as above; quotient < srcHeight
        const int y1 = static_cast<int>(static_cast<long long>(srcHeight) * y / trgHeight);
        int y2 = y1 + 1;
        if (y2 == srcHeight) //stay inside the source image
            --y2;

        const double yy1 = y / scaleY - y1;
        const double y2y = 1 - yy1;

        for (int x = 0; x < trgWidth; ++x)
        {
            //perf: do NOT "simplify" the variable layout without measurement!
            const CoeffsX& bufX = buf[x];
            const int     x1 = bufX.x1;
            const int     x2 = bufX.x2;
            const double xx1 = bufX.xx1;
            const double x2x = bufX.x2x;

            //weights of the four neighboring source pixels
            const double x2xy2y = x2x * y2y;
            const double xx1y2y = xx1 * y2y;
            const double x2xyy1 = x2x * yy1;
            const double xx1yy1 = xx1 * yy1;

            BytePixel pix11; //
            BytePixel pix21; //uninitialized
            BytePixel pix12; //
            BytePixel pix22; //

            srcReader(x1, y1, pix11); //
            srcReader(x2, y1, pix21); //perf: srcReader has to (re-)calculate row using y
            srcReader(x1, y2, pix12); //      => ~7% additional runtime
            srcReader(x2, y2, pix22); //

            const auto interpolate = [&](int offset)
            {
                /* https://en.wikipedia.org/wiki/Bilinear_interpolation
                     (c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
                     (c12(x2 - x) + c22(x - x1)) * (y  - y1)      */
                return static_cast<unsigned char>(pix11[offset] * x2xy2y + pix21[offset] * xx1y2y +
                                                  pix12[offset] * x2xyy1 + pix22[offset] * xx1yy1 + 0.5); //+ 0.5: round to nearest
            };
            trgWriter(BytePixel{interpolate(0),
                                interpolate(1),
                                interpolate(2),
                                interpolate(3)});
        }
    }
}


#if 0
//disabled code: excluded from compilation by the surrounding "#if 0"
//note: calls byteAdvance() and fillBlock(), which are not defined in the visible part of this file
//nearest-neighbor (going over source image - fast for upscaling, since source is read only once)
//  - processes source rows [yFirst, yLast) after clamping the range to [0, srcHeight)
//  - every source pixel is converted once via pixCvrt and written to the block of target pixels that maps back to it
template <class PixSrc, class PixTrg, class PixConverter>
void nearestNeighborScaleOverSource(const PixSrc* src, int srcWidth, int srcHeight, int srcPitch /*[bytes]*/,
                                    /**/  PixTrg* trg, int trgWidth, int trgHeight, int trgPitch /*[bytes]*/,
                                    int yFirst, int yLast, PixConverter pixCvrt /*convert PixSrc to PixTrg*/)
{
    static_assert(std::is_integral_v<PixSrc>, "PixSrc* is expected to be cast-able to char*");
    static_assert(std::is_integral_v<PixTrg>, "PixTrg* is expected to be cast-able to char*");

    static_assert(std::is_same_v<decltype(pixCvrt(PixSrc())), PixTrg>, "PixConverter returning wrong pixel format");

    //a row pitch smaller than the row's pixel data is a caller error
    if (srcPitch < srcWidth * static_cast<int>(sizeof(PixSrc))  ||
        trgPitch < trgWidth * static_cast<int>(sizeof(PixTrg)))
    {
        assert(false);
        return;
    }

    yFirst = std::max(yFirst, 0);
    yLast  = std::min(yLast, srcHeight);
    if (yFirst >= yLast || trgWidth <= 0 || trgHeight <= 0) return;

    for (int y = yFirst; y < yLast; ++y)
    {
        //mathematically: ySrc = floor(srcHeight * yTrg / trgHeight)
        // => search for integers in: [ySrc, ySrc + 1) * trgHeight / srcHeight

        //keep within for loop to support MT input slices!
        const int yTrgFirst = ( y      * trgHeight + srcHeight - 1) / srcHeight; //=ceil(y * trgHeight / srcHeight)
        const int yTrgLast  = ((y + 1) * trgHeight + srcHeight - 1) / srcHeight; //=ceil(((y + 1) * trgHeight) / srcHeight)
        const int blockHeight = yTrgLast - yTrgFirst; //number of target rows mapping to source row y (0 when downscaling skips this row)

        if (blockHeight > 0)
        {
            const PixSrc* srcLine = byteAdvance(src, y         * srcPitch);
            /**/  PixTrg* trgLine = byteAdvance(trg, yTrgFirst * trgPitch);
            int xTrgFirst = 0;

            for (int x = 0; x < srcWidth; ++x)
            {
                const int xTrgLast = ((x + 1) * trgWidth + srcWidth - 1) / srcWidth; //same ceil() formula as for the rows
                const int blockWidth = xTrgLast - xTrgFirst; //number of target columns mapping to source column x
                if (blockWidth > 0)
                {
                    xTrgFirst = xTrgLast;

                    const auto trgPix = pixCvrt(srcLine[x]);
                    fillBlock(trgLine, trgPitch, trgPix, blockWidth, blockHeight);
                    trgLine += blockWidth; //advance to the start of the next target block in this row
                }
            }
        }
    }
}
#endif
}

#endif //XBRZ_TOOLS_H_825480175091875