/* Copyright (C) 2002-2005 RealVNC Ltd.  All Rights Reserved.
 * Copyright (C) 2011 D. R. Commander.  All Rights Reserved.
 * Copyright 2014 Pierre Ossman for Cendio AB
 * Copyright (C) 2015-2025 m-privacy GmbH.  All Rights Reserved.
 * 
 * This is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this software; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 */

#include <rfb/MPCompressor.h>
#include <rdr/Exception.h>
#include <rfb/Rect.h>
#include <rfb/PixelFormat.h>
#include <rfb/ClientParams.h>
#include <rfb/Configuration.h>

#include <stdio.h>
#include <wels/codec_api.h>
extern "C" {
#include <jpeglib.h>
}
#include <setjmp.h>
#include <zstd.h>

#if !defined(__APPLE__)
#include <jxl/encode.h>
#include <jxl/encode_cxx.h>
#if !defined(WIN32)
#include <sys/random.h>
#include <jxl/thread_parallel_runner.h>
#endif
#endif

#include <cassert>

using namespace rfb;

static rfb::LogWriter vlog("MPCompressor");

// Runtime-configurable settings of the MP compressor.
// NOTE(review): the trailing numeric arguments look like default, minimum and
// maximum - confirm against rfb/Configuration.h.

// Thresholds compared against the reported CPU pressure in setCpuPressure()
IntParameter pressureLevelIncrease("PressureLevelIncrease", "CPU pressure above which to raise pressureLevel", 20, 1, 99);
IntParameter pressureLevelDecrease("PressureLevelDecrease", "CPU pressure below which to reduce pressureLevel", 10, 1, 99);
IntParameter pressureLevelClear("PressureLevelClear", "CPU pressure below which to reset pressureLevel to 0", 3, 1, 99);
// ZSTD level; the constructor only logs it when it is > 0
IntParameter VNCZstdLevel("VNCZstdLevel", "VNC ZSTD compression level (0-22, 0 for ZSTD default (currently 3))", 0, 0, 22);
// Initial value of wantOptSize in compress()
BoolParameter MPOptRectSize("MPOptRectSize", "Always try to minimize MP rect sizes", false);
#if !defined(__APPLE__)
// JPEG XL encoder tuning, read by compress() (JPEG XL is compiled out on Apple)
IntParameter jpegxlEffort("JPEGXLEffort", "JPEGXL normal encoder effort/speed (1-9, 1:lightning 2:thunder 3:falcon 4:cheetah 5:hare 6:wombat 7:squirrel 8:kitten 9:tortoise)", 2, 1, 9);
IntParameter jpegxlDecodingSpeed("JPEGXLDecodingSpeed", "JPEGXL decoding speed (0-4, 0 slowest, best quality, 4 fastest)", 0, 0, 4);
IntParameter jpegxlDistance("JPEGXLDistance", "JPEGXL distance (0-15, 0 lossless, 15 fastest)", 1, 0, 15);
#if !defined(WIN32)
// Used by the MP_COMPRESSION_AUTO selection in compress() when saveBandwidthHigh is set
IntParameter autoLbH264Rate("AutoLbH264Rate", "Rate in n/10 to mix H264 under JPEGLB in Save Bandwidth High mode (0-10, 0 none, 10 all)", 10, 0, 10);
IntParameter autoLbH264MinSizeK("AutoLbH264MinSizeK", "Min Rect size for H264 in Save Bandwidth High mode in KPixel (0-999)", 10, 0, 999);
// When set, the constructor creates a libjxl thread-parallel runner
BoolParameter jpegxlMultiThread("JPEGXLMultiThread", "Enable internal multi threading in libjxl on top of our own multi threading", false);
#endif
#endif

/* Upper bound that setCpuPressure() never raises pressureLevel beyond */
#define MAXPRESSURELEVEL 10

/* One entry of the per-level JPEG settings table (see jpegConf). */
struct TightMPJpegConfiguration {
	int quality;	/* value passed to jpeg_set_quality() by compress() */
};

// NOTE: The MP quality and subsampling levels below were obtained
// experimentally by the VirtualGL Project. They represent the approximate
// average compression ratios listed below, as measured across the set of
// every 10th frame in the SPECviewperf 9 benchmark suite.
// Note: This list was taken from the JPEG encoder. Only the quality value is
// stored in jpegConf; compress() derives the subsampling from the quality
// (quality >= 79: none, quality <= 42: 4:2:0, otherwise 4:2:2). With those
// thresholds level 3 (quality 42) is currently encoded with 4:2:0, not with
// the 4:2:2 listed below.
// 9 = MP quality 100, no subsampling (ratio ~= 10:1)
//     [this should be lossless, except for round-off error]
// 8 = MP quality 92,  no subsampling (ratio ~= 20:1)
//     [this should be perceptually lossless, based on current research]
// 7 = MP quality 86,  no subsampling (ratio ~= 25:1)
// 6 = MP quality 79,  no subsampling (ratio ~= 30:1)
// 5 = MP quality 77,  4:2:2 subsampling (ratio ~= 40:1)
// 4 = MP quality 62,  4:2:2 subsampling (ratio ~= 50:1)
// 3 = MP quality 42,  4:2:2 subsampling (ratio ~= 60:1)
// 2 = MP quality 41,  4:2:0 subsampling (ratio ~= 70:1)
// 1 = MP quality 29,  4:2:0 subsampling (ratio ~= 80:1)
// 0 = MP quality 15,  4:2:0 subsampling (ratio ~= 100:1)

/* JPEG quality per MP level; compress() indexes this table with mpLevel,
 * which it restricts to 0-9 beforehand.
 */
static const struct TightMPJpegConfiguration jpegConf[10] = {
	{  15 }, // 0
	{  29 }, // 1
	{  41 }, // 2
	{  42 }, // 3
	{  62 }, // 4
	{  77 }, // 5
	{  79 }, // 6
	{  86 }, // 7
	{  92 }, // 8
	{ 100 }  // 9
};

/* Class-wide state, written through the static-style setters below. */
/* last value handed to setCpuPressure() */
rdr::U8 MPCompressor::cpuPressure = 0;
/* 0..MAXPRESSURELEVEL, adjusted by setCpuPressure() */
rdr::U8 MPCompressor::pressureLevel = 0;
/* consulted by compress() when resolving MP_COMPRESSION_AUTO */
bool MPCompressor::saveBandwidth = false;
bool MPCompressor::saveBandwidthHigh = false;
#if !defined(__APPLE__)
/* lets compress() fall back to JPEGXL instead of JPEGLB for rects too small for H264 */
bool MPCompressor::clientSupportsTightMPJpegXL = false;
#endif

/* Store the latest CPU pressure sample and adapt pressureLevel to it:
 *  - above PressureLevelIncrease: raise by one, capped at MAXPRESSURELEVEL
 *  - otherwise, while pressureLevel > 0:
 *      below PressureLevelClear:    reset to 0
 *      below PressureLevelDecrease: lower by one
 *      else:                        keep
 *
 * value: CPU pressure sample; compared directly against the thresholds.
 */
void MPCompressor::setCpuPressure(rdr::U8 value)
{
  /* Read the thresholds as plain ints (the same int conversion the rest of
   * this file uses for IntParameter) and print them with %d. The former
   * "%s" + getValueStr() form evaluated getValueStr() on every call, even
   * with debug logging disabled, and never released the result.
   */
  const int increaseAbove = pressureLevelIncrease;
  const int decreaseBelow = pressureLevelDecrease;
  const int clearBelow = pressureLevelClear;

  vlog.verbose("setCpuPressure(): got value %u", value);
  cpuPressure = value;
  if (value > increaseAbove) {
    if (pressureLevel < MAXPRESSURELEVEL) {
      pressureLevel++;
      vlog.debug("setCpuPressure(): value %u above %d, increase pressureLevel to %u", value, increaseAbove, pressureLevel);
    } else {
      vlog.debug("setCpuPressure(): value %u above %d, but pressureLevel already at max %u", value, increaseAbove, MAXPRESSURELEVEL);
    }
  } else if (pressureLevel > 0) {
    if (value < clearBelow) {
      pressureLevel = 0;
      vlog.debug("setCpuPressure(): value below %d, reset pressureLevel to 0", clearBelow);
    } else if (value < decreaseBelow) {
      pressureLevel--;
      vlog.debug("setCpuPressure(): value %u below %d, decrease pressureLevel to %u", value, decreaseBelow, pressureLevel);
    } else {
      vlog.debug("setCpuPressure(): value %u between %d and %d, keep pressureLevel at %u", value, decreaseBelow, increaseAbove, pressureLevel);
    }
  }
}

/* Update the class-wide saveBandwidth flag; a debug line is emitted only
 * when the flag actually changes.
 */
void MPCompressor::setSaveBandwidth(bool value)
{
  const bool changed = (saveBandwidth != value);

  if (changed) {
    vlog.debug("setSaveBandwidth(): change value to %u", value);
  }
  saveBandwidth = value;
}

/* Update the class-wide saveBandwidthHigh flag; a debug line is emitted
 * only when the flag actually changes.
 */
void MPCompressor::setSaveBandwidthHigh(bool value)
{
  const bool changed = (saveBandwidthHigh != value);

  if (changed) {
    vlog.debug("setSaveBandwidthHigh(): change value to %u", value);
  }
  saveBandwidthHigh = value;
}

#if !defined(__APPLE__)
/* Record whether the client supports Tight MP JPEG XL; a debug line is
 * emitted only when the stored value actually changes.
 */
void MPCompressor::setSupportsTightMPJpegXL(bool value)
{
  const bool changed = (clientSupportsTightMPJpegXL != value);

  if (changed) {
    vlog.debug("setSupportsTightMPJpegXL(): change value to %u", value);
  }
  clientSupportsTightMPJpegXL = value;
}
#endif

/* JPEG code ported from JpegCompressor */
/* libjpeg error manager extended with what the error callbacks need. */
struct MP_JPEG_ERROR_MGR {
  /* must stay the first member: the callbacks cast jpegCinfo->err, which is
   * set to point at this field, back to MP_JPEG_ERROR_MGR */
  struct jpeg_error_mgr pub;
  /* target of the longjmp() in JpegErrorExit(); armed with setjmp() by callers */
  jmp_buf jmpBuffer;
  /* most recent message formatted by JpegOutputMessage() */
  char lastError[JMSG_LENGTH_MAX];
};

/* libjpeg error_exit hook: have the output_message hook record the error
 * text, then leave libjpeg by jumping to the most recently armed setjmp().
 * Does not return.
 */
static void
JpegErrorExit(j_common_ptr jpegCinfo)
{
  MP_JPEG_ERROR_MGR *const mgr =
    reinterpret_cast<MP_JPEG_ERROR_MGR *>(jpegCinfo->err);

  jpegCinfo->err->output_message(jpegCinfo);
  longjmp(mgr->jmpBuffer, 1);
}

/* libjpeg output_message hook: instead of printing, format the current
 * message into the lastError buffer of our error manager.
 */
static void
JpegOutputMessage(j_common_ptr jpegCinfo)
{
  MP_JPEG_ERROR_MGR *const mgr =
    reinterpret_cast<MP_JPEG_ERROR_MGR *>(jpegCinfo->err);
  char *const messageBuffer = mgr->lastError;

  jpegCinfo->err->format_message(jpegCinfo, messageBuffer);
}

/* libjpeg destination manager that writes into an MPCompressor stream. */
struct MP_JPEG_DEST_MGR {
  /* must stay the first member: the callbacks cast jpegCinfo->dest back to
   * MP_JPEG_DEST_MGR */
  struct jpeg_destination_mgr pub;
  /* stream that receives the compressed bytes */
  MPCompressor *instance;
  /* size of the buffer most recently handed to libjpeg; JpegTermDestination()
   * subtracts free_in_buffer from it to get the bytes written */
  size_t chunkSize;
};

/* libjpeg init_destination hook: point libjpeg at the write position of the
 * owning MPCompressor and tell it how much room is available there.
 * NOTE(review): getptr(n) presumably guarantees at least n writable bytes -
 * confirm against MemOutStream.
 */
static void
JpegInitDestination(j_compress_ptr jpegCinfo)
{
  MP_JPEG_DEST_MGR *const dest =
    reinterpret_cast<MP_JPEG_DEST_MGR *>(jpegCinfo->dest);
  MPCompressor *const stream = dest->instance;

  /* fetch the write pointer first, only then ask how much room there is */
  JOCTET *const writePos = (JOCTET *) stream->getptr(stream->length());
  const size_t room = stream->avail();

  dest->pub.next_output_byte = writePos;
  dest->chunkSize = room;
  dest->pub.free_in_buffer = room;
}

/* libjpeg empty_output_buffer hook: account for the whole available region
 * as written, then hand libjpeg a fresh region of the stream.
 * Always returns TRUE.
 * NOTE(review): setptr(n) presumably advances the write position by n and
 * getptr(n) presumably makes room for n more bytes - confirm against
 * MemOutStream.
 */
static boolean
JpegEmptyOutputBuffer(j_compress_ptr jpegCinfo)
{
  MP_JPEG_DEST_MGR *const dest =
    reinterpret_cast<MP_JPEG_DEST_MGR *>(jpegCinfo->dest);
  MPCompressor *const stream = dest->instance;

  const size_t consumed = stream->avail();
  stream->setptr(consumed);

  dest->pub.next_output_byte = stream->getptr(stream->length());

  const size_t room = stream->avail();
  dest->chunkSize = room;
  dest->pub.free_in_buffer = room;
  return TRUE;
}

/* libjpeg term_destination hook: account for the part of the last chunk
 * that libjpeg actually filled.
 */
static void
JpegTermDestination(j_compress_ptr jpegCinfo)
{
  MP_JPEG_DEST_MGR *const dest =
    reinterpret_cast<MP_JPEG_DEST_MGR *>(jpegCinfo->dest);
  MPCompressor *const stream = dest->instance;
  const size_t written = dest->chunkSize - dest->pub.free_in_buffer;

  stream->setptr(written);
}

/* convert RGB to videoFormatI420: IYUV 4:2:0 with 12 Bit/pixel, planar
 * I420: 1(Y) + 1/4(U) + 1/4(V)
 * uneven width or height need extra column or line
 * see https://www.fourcc.org/pixel-format/yuv-i420/
 * code inspired from:
 * https://stackoverflow.com/questions/9465815/rgb-to-yuv420-algorithm-efficiency
 * formula from
 * https://docs.microsoft.com/en-us/windows/win32/medfound/recommended-8-bit-yuv-formats-for-video-rendering
 * https://de.wikipedia.org/wiki/YUV-Farbmodell
 */
/* Luma of one RGB pixel: fixed-point weighted sum (8 fractional bits,
 * +128 for rounding) plus an offset of 16.
 */
static inline rdr::U8 calcY(rdr::U8 r, rdr::U8 g, rdr::U8 b)
{
	const int weighted = 66 * r + 129 * g + 25 * b + 128;

	return (weighted >> 8) + 16;
}
/* U chroma of one RGB pixel, centred on 128 */
static inline rdr::U8 calcU(rdr::U8 r, rdr::U8 g, rdr::U8 b)
{
	const int weighted = -38 * r + -74 * g + 112 * b + 128;

	return (weighted >> 8) + 128;
}
/* V chroma of one RGB pixel, centred on 128 */
static inline rdr::U8 calcV(rdr::U8 r, rdr::U8 g, rdr::U8 b)
{
	const int weighted = 112 * r + -94 * g + -18 * b + 128;

	return (weighted >> 8) + 128;
}
/* Convert a tightly packed RGB(x) image to planar I420.
 *
 * destination: output buffer; Y plane first (width*height bytes), then the U
 *              samples, then the V samples at the offset computed below
 * rgba:        input pixels, pixelsize bytes each, R, G, B in bytes 0..2,
 *              rows stored back to back (width pixels per row)
 * width, height: image size in pixels
 * pixelsize:   bytes per input pixel
 *
 * Chroma is taken from the top-left pixel of every 2x2 block (no averaging).
 */
static void rgbToI420(rdr::U8 *destination, rdr::U8 *rgba,
                     const int &width, const int &height, const int pixelsize)
{
	const size_t lumaCount = width * height;
	size_t uIdx = lumaCount;
	/* V starts after the quarter-size U plane plus the slack reserved for
	 * odd widths and heights */
	size_t vIdx = uIdx + uIdx / 4 + (width & 1) * height + (height & 1) * width;

	/* pass 1: one Y sample per pixel */
	#pragma GCC ivdep
	for (int row = 0; row < height; row++) {
		const size_t rowStart = row*width;

		#pragma GCC ivdep
		for (int col = 0; col < width; col++) {
			const rdr::U8 *px = rgba + pixelsize*(rowStart+col);

			destination[rowStart+col] = calcY(px[0], px[1], px[2]);
		}
	}

	/* pass 2: one U and one V sample per 2x2 block */
	#pragma GCC ivdep
	for (int row = 0; row < height; row += 2) {
		const size_t rowStart = row*width;

		#pragma GCC ivdep
		for (int col = 0; col < width; col += 2) {
			const rdr::U8 *px = rgba + pixelsize*(rowStart+col);
			const rdr::U8 red = px[0];
			const rdr::U8 green = px[1];
			const rdr::U8 blue = px[2];

			destination[uIdx++] = calcU(red, green, blue);
			destination[vIdx++] = calcV(red, green, blue);
		}
	}
}


/* Set up a compressor instance.
 *
 * bufferLen: forwarded to the MemOutStream base class constructor.
 *
 * Resets the per-compression statistics, fills in the openh264 base
 * parameters, creates the libjpeg compressor with our error and destination
 * managers and (except on Apple) the JPEG XL encoder and, if enabled, its
 * thread runner.
 *
 * Throws rdr::Exception carrying libjpeg's message when libjpeg reports an
 * error during jpeg_create_compress().
 */
MPCompressor::MPCompressor(int bufferLen) : MemOutStream(bufferLen),
	welsVideoFormat(videoFormatI420), welsComplexity(MEDIUM_COMPLEXITY),
	zstdCctx(NULL)
{
	classLog = &vlog;
	classMemLog = &vlog;

	/* start all statistics from scratch */
#pragma GCC ivdep
	for (int i = MP_COMPRESSION_MIN; i <= MP_COMPRESSION_MAX; i++) {
		compressionCount[i] = 0;
		optimizationCount[i] = 0;
		totalBytes[i] = 0;
		totalCBytes[i] = 0;
		minCompressed[i] = 100000;
		maxCompressed[i] = 0;
	}

	/* openh264 parameters shared by all H264 rects; the picture size is
	 * filled in per rect by compress() */
	memset (&welsParam, 0, sizeof(welsParam));
	welsParam.iUsageType = SCREEN_CONTENT_REAL_TIME;
	welsParam.fMaxFrameRate = 24;
	welsParam.iTargetBitrate = 5000000;

	jpegCinfo = new jpeg_compress_struct;
	jpegErr = new struct MP_JPEG_ERROR_MGR;
	jpegCinfo->err = jpeg_std_error(&jpegErr->pub);
	snprintf(jpegErr->lastError, JMSG_LENGTH_MAX, "No error");
	jpegErr->pub.error_exit = JpegErrorExit;
	jpegErr->pub.output_message = JpegOutputMessage;
	if(setjmp(jpegErr->jmpBuffer)) {
		// this will execute if libjpeg has an error
		/* The destructor does not run for a partially constructed
		 * object, so release what was allocated above before throwing.
		 * The message lives in jpegErr and is copied out first.
		 */
		char message[JMSG_LENGTH_MAX];

		snprintf(message, sizeof(message), "%s", jpegErr->lastError);
		delete jpegErr;
		delete jpegCinfo;
		throw rdr::Exception("%s", message);
	}
	jpeg_create_compress(jpegCinfo);
	jpegDest = new struct MP_JPEG_DEST_MGR;
	jpegDest->pub.init_destination = JpegInitDestination;
	jpegDest->pub.empty_output_buffer = JpegEmptyOutputBuffer;
	jpegDest->pub.term_destination = JpegTermDestination;
	jpegDest->instance = this;
	/* defined value until JpegInitDestination() sets the real size */
	jpegDest->chunkSize = 0;
	jpegCinfo->dest = (struct jpeg_destination_mgr *)jpegDest;

#if !defined(__APPLE__)
	jpegxlEnc = JxlEncoderCreate(nullptr);
	vlog.info("JPEGXL normal encoder effort/speed %u, decoding speed %u", (int) jpegxlEffort, (int) jpegxlDecodingSpeed);
#if !defined(WIN32)
	if (jpegxlMultiThread) {
		size_t jxlNumThreads = JxlThreadParallelRunnerDefaultNumWorkerThreads();
		/* cast so the argument matches %lu on every platform */
		vlog.debug("JPEGXL internal compression uses %lu threads", (unsigned long) jxlNumThreads);
		jpegxlRunner = JxlThreadParallelRunnerCreate(nullptr, jxlNumThreads);
	} else {
		jpegxlRunner = NULL;
	}
#endif
	/* 8-bit sRGB input with one 8-bit alpha channel; the image size is set
	 * per rect by compress() */
	jpegxlColorEncoding = {};
	JxlColorEncodingSetToSRGB(&jpegxlColorEncoding, false);
	JxlEncoderInitBasicInfo(&jpegxlBasicInfo);
	jpegxlBasicInfo.bits_per_sample = 8;
	jpegxlBasicInfo.exponent_bits_per_sample = 0;
	jpegxlBasicInfo.num_color_channels = 3;
	jpegxlBasicInfo.alpha_bits = 8;
	jpegxlBasicInfo.alpha_exponent_bits = 0;
	jpegxlBasicInfo.num_extra_channels = 1;
	jpegxlBasicInfo.uses_original_profile = JXL_FALSE;
#endif

	if (VNCZstdLevel > 0)
		vlog.info("ZSTD compression level set to %u", (int) VNCZstdLevel);
}

/* Release the JPEG XL, libjpeg and ZSTD resources of this instance, log the
 * collected compression statistics (non-Apple, non-Windows builds, log
 * level >= 100 only) and reset the class log hooks.
 */
MPCompressor::~MPCompressor(void)
{
#if !defined(__APPLE__)
#if !defined(WIN32)
	/* Decide by the pointer, not by the parameter: the constructor always
	 * sets jpegxlRunner (NULL when multi threading was off), while the
	 * parameter may have been changed since construction.
	 */
	if (jpegxlRunner) {
		JxlThreadParallelRunnerDestroy(jpegxlRunner);
	}
#endif
	JxlEncoderDestroy(jpegxlEnc);
#endif

	/* If libjpeg reports an error while destroying the compressor we land
	 * here again with a non-zero setjmp() result. In that case only the
	 * libjpeg call is skipped; the remaining cleanup below still runs so
	 * nothing is leaked and the class log hooks are still reset.
	 */
	if (setjmp(jpegErr->jmpBuffer) == 0) {
		jpeg_destroy_compress(jpegCinfo);
	}
	delete jpegErr;
	delete jpegDest;
	delete jpegCinfo;

	if (zstdCctx) {
		ZSTD_freeCCtx(zstdCctx);
	}

#if !defined(__APPLE__) && !defined(WIN32)
	if (vlog.getLevel() >= 100) {
		/* gettid() does not return an unsigned long; cast to match %lu */
		const unsigned long threadId = (unsigned long) gettid();
		unsigned compressionCountSum = 0;
		rdr::U64 totalBytesSum = 0;
		rdr::U64 totalCBytesSum = 0;
		int minCompressedTotal = 100000;
		int maxCompressedTotal = 0;
		int optimizationCountSum = 0;
		for (int i = MP_COMPRESSION_MIN; i <= MP_COMPRESSION_MAX; i++) {
			/* skip unused compressions; also guards the divisions below */
			if (compressionCount[i] > 0 && totalBytes[i] > 0) {
				compressionCountSum += compressionCount[i];
				vlog.debug("thread %lu: compression %u %s: %u calls compressed %llu Bytes to %llu Bytes with ratio %llu%%, average rect size %llu uncompressed and %llu compressed, min/max compressed size %u/%u, optimizationCount %u", threadId, i, mpCompressionName(i), compressionCount[i], totalBytes[i], totalCBytes[i], 100 * totalCBytes[i] / totalBytes[i], totalBytes[i]/compressionCount[i], totalCBytes[i]/compressionCount[i], minCompressed[i], maxCompressed[i], optimizationCount[i]);
				totalBytesSum += totalBytes[i];
				totalCBytesSum += totalCBytes[i];
				optimizationCountSum += optimizationCount[i];
				if (minCompressed[i] < minCompressedTotal)
					minCompressedTotal = minCompressed[i];
				if (maxCompressed[i] > maxCompressedTotal)
					maxCompressedTotal = maxCompressed[i];
			}
		}
		if (compressionCountSum > 0 && totalBytesSum > 0 && totalCBytesSum > 0)
			vlog.debug("thread %lu: %u calls compressed %llu Bytes to %llu Bytes with ratio %llu%%, average rect size %llu uncompressed and %llu compressed, min/max compressed size %u/%u, optimizationCount %u", threadId, compressionCountSum, totalBytesSum, totalCBytesSum, 100 * totalCBytesSum / totalBytesSum, totalBytesSum/compressionCountSum, totalCBytesSum/compressionCountSum, minCompressedTotal, maxCompressedTotal, optimizationCountSum);
	}
#endif
	resetClassLog();
	resetClassMemLog();
}

/* Number of bytes to allocate for an I420 image of mpWidth x mpHeight:
 * 1.5 bytes per pixel (Y plus quarter-size U and V) plus, for each of the
 * two chroma planes, a spare column when the width is odd and a spare line
 * when the height is odd.
 */
static inline int i420DataSize(int mpWidth, int mpHeight)
{
	const int planeBytes = mpWidth * mpHeight * 3 / 2;
	const int oddWidthColumn = (mpWidth & 1) * mpHeight;
	const int oddHeightLine = (mpHeight & 1) * mpWidth;

	return planeBytes + 2 * (oddWidthColumn + oddHeightLine);
}

#if !defined(WIN32)
bool MPCompressor::compress(const rdr::U8 *buf, int stride, const Rect& r,
	const PixelFormat& pf, int mpLevel, int mpCompression)
{
	const int w = r.width();
	const int h = r.height();
	const int rectSize = w * h;
	int mpWidth;
	int mpHeight;
//	const int mpCompressionWanted = mpCompression;

	ISVCEncoder* welsEncoder = NULL;

#if !defined(__APPLE__)
	int jpegxlEffortUsed = jpegxlEffort;
	int jpegxlDecodingSpeedUsed = jpegxlDecodingSpeed;
#endif

	const int srcPixelsize = 4;
	/* mpPixelsize may only be 3 or 4 */
	int mpPixelsize;
	int mpStride;
	rdr::U8 * data;
	int dataSize = 0;
	rdr::U8 * tmpBuf = NULL;
	bool is888;
	bool wExtra = false;
	bool hExtra = false;
	bool needI420 = false;
	bool direct = false;
	bool wantOptSize = MPOptRectSize;

	is888 = pf.is888();
	if (stride == 0)
		stride = w;

	if (mpLevel < 0 || mpLevel > 9) {
		vlog.error("invalid mpLevel %i, setting to 9", mpLevel);
		mpLevel = 9;
	}

	if (mpCompression == MP_COMPRESSION_AUTO) {
		if (saveBandwidthHigh) {
			if (rectSize < autoLbH264MinSizeK * 1000) {
				mpCompression = MP_COMPRESSION_JPEGLB;
			} else {
				int autoLbH264RateUsed = autoLbH264Rate;

				if (pressureLevel > 0) {
					if (autoLbH264RateUsed > pressureLevel)
						autoLbH264RateUsed -= pressureLevel;
					else
						autoLbH264RateUsed = 0;
				}
				if (autoLbH264RateUsed == 10) {
					mpCompression = MP_COMPRESSION_H264;
				} else if (autoLbH264RateUsed == 0) {
					mpCompression = MP_COMPRESSION_JPEGLB;
				} else {
					rdr::U8 randByte = 0;

					getrandom(&randByte, 1, 0);
					if ((int) randByte * 10 / 256 < autoLbH264RateUsed) {
						mpCompression = MP_COMPRESSION_H264;
					} else {
						mpCompression = MP_COMPRESSION_JPEGLB;
					}
				}
			}
		} else if (saveBandwidth) {
//			vlog.verbose("AUTO saveBandwidth: w: %u, h: %u, choosing JPEGLB", w, h);
			mpCompression = MP_COMPRESSION_JPEGLB;
		} else {
			if (pressureLevel == 0)
				mpCompression = MP_COMPRESSION_JPEG;
			else
				mpCompression = MP_COMPRESSION_JPEGLB;
		}
	}

	if (mpCompression == MP_COMPRESSION_H264) {
		/* openh264 has min. size 16x16, we increase to even numbers */
		if (h < 15 || w < 15) {
#if !defined(__APPLE__)
			if (clientSupportsTightMPJpegXL)
				mpCompression = MP_COMPRESSION_JPEGXL;
			else
#endif
				mpCompression = MP_COMPRESSION_JPEGLB;
		} else {
			wantOptSize = true;
		}
	}
	if (mpCompression == MP_COMPRESSION_JPEGLB) {
		mpCompression = MP_COMPRESSION_JPEG;
		if (mpLevel > 0)
			mpLevel--;
	}
	/* do not waste resources for compression of small rects,
	   result will be bigger anyway
	 */
	/* JPEG has min usedSize of 640B, try to save time and space (4B/pixel) */
	if (mpCompression == MP_COMPRESSION_JPEG) {
		if (mpLevel == 9) {
			if (rectSize <= 20)
				mpCompression = MP_COMPRESSION_RAW;
			else if (rectSize <= 200)
				mpCompression = MP_COMPRESSION_ZSTD;
			else
				wantOptSize = true;
		} else {
			if (rectSize <= 12)
				mpCompression = MP_COMPRESSION_RAWI420;
			else if (rectSize <= 400)
				mpCompression = MP_COMPRESSION_I420ZSTD;
			else
				wantOptSize = true;
		}
#if !defined(__APPLE__)
	/* JPEGXL has min usedSize of 88B, but small sizes are inefficient: */
	/* try to optimize time and space (4B/pixel) */
	} else if (mpCompression == MP_COMPRESSION_JPEGXL) {
		if (rectSize <= 20)
			mpCompression = MP_COMPRESSION_RAW;
		else if (mpLevel < 9 && (h < 4 || rectSize < 200))
			mpCompression = MP_COMPRESSION_I420ZSTD;
		else if (h < 4 || rectSize <= 100)
			mpCompression = MP_COMPRESSION_ZSTD;
		else
			wantOptSize = true;
#endif
	}

	mpWidth = w;
	mpHeight = h;

	const size_t headerStartPos = length();
	/* write out: header of size MP_HEADER_SIZE */
	writeU8(mpCompression);
	/* flags: highest 4 Bits for mpLevel, lower 4 Bits currently unused */
	writeU8(mpLevel << 4);
	const size_t startPos = length();

	if (mpCompression == MP_COMPRESSION_RAW) {
		mpPixelsize = 4;
		mpStride = mpWidth * mpPixelsize;
		dataSize = mpHeight * mpStride;
		if (is888) {
//			vlog.verbose("RAW: directly use OutStream buffer with size %u", dataSize);
			direct = true;
			data = getptr(dataSize);
		} else {
			data = (rdr::U8*) malloc(dataSize);
			if (!data) {
				vlog.error("malloc(%u) for RAW failed", dataSize);
				return false;
			}
		}
	} else if (mpCompression == MP_COMPRESSION_JPEG) {
		const int quality = jpegConf[mpLevel].quality;

#ifdef JCS_EXTENSIONS
		mpPixelsize = 4;
#else
		mpPixelsize = 3;
#endif
		mpStride = mpWidth * mpPixelsize;
//		vlog.verbose("JPEG: w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u", w, h, stride, mpWidth, mpStride);
		dataSize = mpHeight * mpStride;
		if(setjmp(jpegErr->jmpBuffer)) {
			// this will execute if libjpeg has an error
			jpeg_abort_compress(jpegCinfo);
			throw rdr::Exception("%s", jpegErr->lastError);
		}
		jpegCinfo->image_width = mpWidth;
		jpegCinfo->image_height = mpHeight;
#ifdef JCS_EXTENSIONS
		jpegCinfo->in_color_space = JCS_EXT_RGBX;
#else
		jpegCinfo->in_color_space = JCS_RGB;
#endif
		jpegCinfo->input_components = mpPixelsize;
		jpeg_set_defaults(jpegCinfo);
		jpeg_set_quality(jpegCinfo, quality, TRUE);
		if (quality >= 96)
			jpegCinfo->dct_method = JDCT_ISLOW;
		else
			jpegCinfo->dct_method = JDCT_FASTEST;
		if (quality >= 79) {
			jpegCinfo->comp_info[0].h_samp_factor = 1;
			jpegCinfo->comp_info[0].v_samp_factor = 1;
		} else if (quality <= 42) {
			jpegCinfo->comp_info[0].h_samp_factor = 2;
			jpegCinfo->comp_info[0].v_samp_factor = 2;
		} else {
			jpegCinfo->comp_info[0].h_samp_factor = 2;
			jpegCinfo->comp_info[0].v_samp_factor = 1;
		}
#ifdef JCS_EXTENSIONS
		if (is888) {
			JSAMPROW *jpegRowPointer;

			mpStride = stride * srcPixelsize;
//			vlog.verbose2("JPEG direct: w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u", w, h, stride, mpWidth, mpStride);
			jpegRowPointer = new JSAMPROW[mpHeight];
#pragma GCC ivdep
			for (int y = 0; y < mpHeight; y++)
				jpegRowPointer[y] = (JSAMPROW) buf + y * mpStride;
			jpeg_start_compress(jpegCinfo, TRUE);
			while (jpegCinfo->next_scanline < jpegCinfo->image_height)
				jpeg_write_scanlines(jpegCinfo, &jpegRowPointer[jpegCinfo->next_scanline],
					jpegCinfo->image_height - jpegCinfo->next_scanline);
			jpeg_finish_compress(jpegCinfo);
			delete[] jpegRowPointer;
			ssize_t usedSize = length() - startPos;
			if (wantOptSize && mpLevel < 9) {
				const int i420Size = i420DataSize(w, h) / 2;

				if (usedSize > i420Size + MP_COMPRESSION_MIN_SAVE_SIZE) {
					vlog.verbose("wantOptSize: mpCompression: %s, mpLevel %u, w: %u, h: %u, stride: %u, mpWidth: %u, usedSize %lu > i420Size %u + %u, repeat with I420ZSTD", mpCompressionName(mpCompression), mpLevel, w, h, stride, mpWidth, usedSize, i420Size, MP_COMPRESSION_MIN_SAVE_SIZE);
					reposition(headerStartPos);
					optimizationCount[MP_COMPRESSION_I420ZSTD]++;
					return compress(buf, stride, r, pf, mpLevel, MP_COMPRESSION_I420ZSTD);
				}
			} else {
				const int zstdEstSize = rectSize * 2;

				if (usedSize > zstdEstSize + MP_COMPRESSION_MIN_SAVE_SIZE) {
					vlog.verbose("optimize: mpCompression: %s, mpLevel: %u, w: %u, h: %u, stride: %u, mpWidth: %u, usedSize %lu > zstdEstSize %u + %u, repeat with ZSTD", mpCompressionName(mpCompression), mpLevel, w, h, stride, mpWidth, usedSize, zstdEstSize, MP_COMPRESSION_MIN_SAVE_SIZE);
					reposition(headerStartPos);
					optimizationCount[MP_COMPRESSION_ZSTD]++;
					return compress(buf, stride, r, pf, mpLevel, MP_COMPRESSION_ZSTD);
				}
			}
			if (vlog.getLevel() >= 100) {
				compressionCount[mpCompression]++;
				totalBytes[mpCompression] += mpWidth * mpHeight * srcPixelsize;
				totalCBytes[mpCompression] += usedSize;
				if (usedSize < minCompressed[mpCompression])
					minCompressed[mpCompression] = usedSize;
				if (usedSize > maxCompressed[mpCompression])
					maxCompressed[mpCompression] = usedSize;
			}
			return true;
		}
#endif
//		vlog.verbose2("JPEG indirect: w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u", w, h, stride, mpWidth, mpStride);
		data = (rdr::U8*) malloc(dataSize);
		if (!data) {
			vlog.error("malloc(%u) for JPEG failed", dataSize);
			return false;
		}
#if !defined(__APPLE__)
	} else if (mpCompression == MP_COMPRESSION_JPEGXL) {
		JxlEncoderReset(jpegxlEnc);
#if !defined(__APPLE__) && !defined(WIN32)
		if (jpegxlMultiThread) {
			jpegxlStatus = JxlEncoderSetParallelRunner(jpegxlEnc,
					JxlThreadParallelRunner,
					jpegxlRunner);
			if (JXL_ENC_SUCCESS != jpegxlStatus) {
				vlog.error("JxlEncoderSetParallelRunner() failed with error %i", JxlEncoderGetError(jpegxlEnc));
				return false;
			}
		}
#endif
		jpegxlBasicInfo.xsize = mpWidth;
		jpegxlBasicInfo.ysize = mpHeight;
		jpegxlBasicInfo.intrinsic_xsize = mpWidth;
		jpegxlBasicInfo.intrinsic_ysize = mpHeight;
		jpegxlStatus = JxlEncoderSetBasicInfo(jpegxlEnc, &jpegxlBasicInfo);
		if (JXL_ENC_SUCCESS != jpegxlStatus) {
			vlog.error("JxlEncoderSetBasicInfo() failed with error %i", JxlEncoderGetError(jpegxlEnc));
			return false;
		}
		jpegxlStatus = JxlEncoderSetColorEncoding(jpegxlEnc, &jpegxlColorEncoding);
		if (JXL_ENC_SUCCESS != jpegxlStatus) {
			vlog.error("JxlEncoderSetColorEncoding() failed with error %i", JxlEncoderGetError(jpegxlEnc));
			return false;
		}
		jpegxlFrameSettings = JxlEncoderFrameSettingsCreate(jpegxlEnc, nullptr);
		jpegxlStatus = JxlEncoderSetFrameDistance(jpegxlFrameSettings, jpegxlDistance);
		if (JXL_ENC_SUCCESS != jpegxlStatus) {
			vlog.error("JxlEncoderSetFrameDistance() failed with error %i", JxlEncoderGetError(jpegxlEnc));
			return false;
		}
		jpegxlStatus = JxlEncoderFrameSettingsSetOption(jpegxlFrameSettings, JXL_ENC_FRAME_SETTING_EFFORT, jpegxlEffortUsed);
		if (JXL_ENC_SUCCESS != jpegxlStatus) {
			vlog.error("JxlEncoderFrameSettingsSetOption(JXL_ENC_FRAME_SETTING_EFFORT) failed with error %i", JxlEncoderGetError(jpegxlEnc));
			return false;
		}
		jpegxlStatus = JxlEncoderFrameSettingsSetOption(jpegxlFrameSettings, JXL_ENC_FRAME_SETTING_DECODING_SPEED, jpegxlDecodingSpeedUsed);
		if (JXL_ENC_SUCCESS != jpegxlStatus) {
			vlog.error("JxlEncoderFrameSettingsSetOption(JXL_ENC_FRAME_SETTING_DECODING_SPEED) failed with error %i", JxlEncoderGetError(jpegxlEnc));
			return false;
		}
//		vlog.debug("JPEGXL: speed_tier: %u, decoding_speed_tier: %u, distance: %.1f", (int) jpegxlFrameSettings->values.cparams.speed_tier, (int) jpegxlFrameSettings->values.cparams.decoding_speed_tier, (float) jpegxlFrameSettings->values.cparams.butteraugli_distance);
#if 0 // this triggers a double free bug in libjxl
		jpegxlStatus = JxlEncoderFrameSettingsSetOption(jpegxlFrameSettings, JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 8);
		if (JXL_ENC_SUCCESS != jpegxlStatus) {
			vlog.error("JxlEncoderFrameSettingsSetOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING) failed with error %i", JxlEncoderGetError(jpegxlEnc));
			return false;
		}
#endif
		if (is888) {
			dataSize = stride * srcPixelsize * h;
//			vlog.verbose2("JPEGXL: direct: w: %u, h: %u, stride: %u, dataSize: %u, effort: %u, decoding speed: %u, distance: %u", w, h, stride, dataSize, jpegxlEffortUsed, jpegxlDecodingSpeedUsed, (int) jpegxlDistance);
			/* last param = data align in bytes, 0 = not needed, might be stride */
			/* JXL_FULLALPHA_ENDIAN as endianness is an m-privacy hack to */
			/* work around broken alpha bytes in VNC, it forces alpha to max. */
			jpegxlPixelFormat.num_channels = (uint32_t) srcPixelsize;
			jpegxlPixelFormat.data_type = JXL_TYPE_UINT8;
			jpegxlPixelFormat.endianness = JXL_FULLALPHA_ENDIAN;
			jpegxlPixelFormat.align = (size_t) stride * srcPixelsize;
			jpegxlStatus = JxlEncoderAddImageFrame(jpegxlFrameSettings,
				&jpegxlPixelFormat,
				(void*) buf,
				dataSize);
			if (JXL_ENC_SUCCESS != jpegxlStatus) {
				vlog.error("JxlEncoderAddImageFrame() failed with error %i", JxlEncoderGetError(jpegxlEnc));
				return false;
			}
			JxlEncoderCloseInput(jpegxlEnc);
			uint8_t* nextOut = getptr(2048);
			uint8_t* startOut = nextOut;
			size_t availOut = avail();
			JxlEncoderStatus jpegxlProcessResult = JXL_ENC_NEED_MORE_OUTPUT;
			while (jpegxlProcessResult == JXL_ENC_NEED_MORE_OUTPUT) {
				jpegxlProcessResult = JxlEncoderProcessOutput(jpegxlEnc, &nextOut, &availOut);
				if (jpegxlProcessResult == JXL_ENC_NEED_MORE_OUTPUT) {
					size_t offset = nextOut - startOut;
					vlog.debug("JPEGXL: need more bytes at offset %lu, avail %lu, increasing", offset, availOut);
					startOut = getptr(2 * avail());
					nextOut = startOut + offset;
					availOut = avail() - offset;
					vlog.debug("JPEGXL: needed more bytes, increased to avail of %lu", availOut);
					if (availOut < 2048) {
						vlog.error("failed to increase buffer!");
						return false;
					}
				}
			}
			if (JXL_ENC_SUCCESS != jpegxlProcessResult) {
				vlog.error("JxlEncoderProcessOutput() failed with error %i", JxlEncoderGetError(jpegxlEnc));
				return false;
			}
			ssize_t usedSize = nextOut - startOut;
			setptr(usedSize);
			if (wantOptSize && mpLevel < 9) {
				const int i420Size = i420DataSize(w, h) / 2;

				if (usedSize > i420Size + MP_COMPRESSION_MIN_SAVE_SIZE) {
					vlog.verbose2("wantOptSize: mpCompression: %s, mpLevel %u, w: %u, h: %u, stride: %u, mpWidth: %u, usedSize %lu > i420Size %u + %u, repeat with I420ZSTD", mpCompressionName(mpCompression), mpLevel, w, h, stride, mpWidth, usedSize, i420Size, MP_COMPRESSION_MIN_SAVE_SIZE);
					reposition(headerStartPos);
					optimizationCount[MP_COMPRESSION_I420ZSTD]++;
					return compress(buf, stride, r, pf, mpLevel, MP_COMPRESSION_I420ZSTD);
				}
			} else {
				const int zstdEstSize = rectSize * 2;

				if (usedSize > zstdEstSize + MP_COMPRESSION_MIN_SAVE_SIZE) {
					vlog.verbose("optimize: mpCompression: %s, mpLevel: %u, w: %u, h: %u, stride: %u, mpWidth: %u, usedSize %lu > zstdEstSize %u + %u, repeat with ZSTD", mpCompressionName(mpCompression), mpLevel, w, h, stride, mpWidth, usedSize, zstdEstSize, MP_COMPRESSION_MIN_SAVE_SIZE);
					reposition(headerStartPos);
					optimizationCount[MP_COMPRESSION_ZSTD]++;
					return compress(buf, stride, r, pf, mpLevel, MP_COMPRESSION_ZSTD);
				}
			}
			if (vlog.getLevel() >= 100) {
				compressionCount[mpCompression]++;
				totalBytes[mpCompression] += mpWidth * mpHeight * srcPixelsize;
				totalCBytes[mpCompression] += usedSize;
				if (usedSize < minCompressed[mpCompression])
					minCompressed[mpCompression] = usedSize;
				if (usedSize > maxCompressed[mpCompression])
					maxCompressed[mpCompression] = usedSize;
			}
			return true;
		}
		mpPixelsize = 4;
		mpStride = mpWidth * mpPixelsize;
//		vlog.verbose2("JPEGXL: w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u, effort: %u, decoding speed: %u, distance: %u", w, h, stride, mpWidth, mpStride, jpegxlEffortUsed, jpegxlDecodingSpeedUsed, (int) jpegxlDistance);
		dataSize = mpHeight * mpStride;
		/* last param = data align in bytes, 0 = not needed, might be stride */
		/* JXL_FULLALPHA_ENDIAN as endianness is an m-privacy hack to */
		/* work around broken alpha bytes in VNC, it forces alpha to max. */
		jpegxlPixelFormat.num_channels = (uint32_t) srcPixelsize;
		jpegxlPixelFormat.data_type = JXL_TYPE_UINT8;
		jpegxlPixelFormat.endianness = JXL_FULLALPHA_ENDIAN;
		jpegxlPixelFormat.align = (size_t) 0;
		vlog.debug("JPEGXL: converting: w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u", w, h, stride, mpWidth, mpStride);
		data = (rdr::U8*) malloc(dataSize);
		if (!data) {
			vlog.error("malloc(%u) for JPEGXL failed", dataSize);
			return false;
		}
#endif
	} else if (mpCompression == MP_COMPRESSION_H264) {
		int rv;

		/* only supports even numbers for width and height */
		if (mpWidth & 1) {
			mpWidth++;
			wExtra = true;
		}
		if (mpHeight & 1) {
			mpHeight++;
			hExtra = true;
		}
		/* mpLevel setting LOW, MEDIUM, HIGH, better is slower */
		if (mpLevel >= 7)
			welsComplexity = HIGH_COMPLEXITY;
		else if (mpLevel <= 3)
			welsComplexity = LOW_COMPLEXITY;
		else
			welsComplexity = MEDIUM_COMPLEXITY;
		mpPixelsize = 3;
		mpStride = mpWidth * mpPixelsize;
		needI420 = true;
		/* U and V each need 1/4 of pixels plus extra column for
		 * uneven mpWidth and extra line for uneven mpHeight
		 */
		dataSize = i420DataSize(mpWidth, mpHeight);
		data = (rdr::U8*) malloc(dataSize);
		if (!data) {
			vlog.error("malloc(%u) for H264 I420 data failed", dataSize);
			return false;
		}
		rv = WelsCreateSVCEncoder(&welsEncoder);
		if (rv != cmResultSuccess || welsEncoder == NULL) {
			vlog.error("WelsCreateSVCEncoder() failed with error %i", rv);
			free(data);
			return false;
		}
		welsParam.iPicWidth = mpWidth;
		welsParam.iPicHeight = mpHeight;
		rv = welsEncoder->Initialize(&welsParam);
		if (rv != cmResultSuccess) {
			vlog.error("welsEncoder->Initialize() failed with error %i", rv);
			free(data);
			WelsDestroySVCEncoder(welsEncoder);
			return false;
		}
		rv = welsEncoder->SetOption(ENCODER_OPTION_DATAFORMAT, &welsVideoFormat);
		if (rv != cmResultSuccess) {
			vlog.error("elsEncoder->SetOption(ENCODER_OPTION_DATAFORMAT) failed with error %i", rv);
			free(data);
			welsEncoder->Uninitialize();
			WelsDestroySVCEncoder(welsEncoder);
			return false;
		}
		rv = welsEncoder->SetOption(ENCODER_OPTION_COMPLEXITY, &welsComplexity);
		if (rv != cmResultSuccess) {
			vlog.error("elsEncoder->SetOption(ENCODER_OPTION_COMPLEXITY) failed with error %i", rv);
			free(data);
			welsEncoder->Uninitialize();
			WelsDestroySVCEncoder(welsEncoder);
			return false;
		}
		rv = welsEncoder->SetOption(ENCODER_OPTION_SVC_ENCODE_PARAM_BASE, &welsParam);
		if (rv != cmResultSuccess) {
			vlog.error("elsEncoder->SetOption(ENCODER_OPTION_SVC_ENCODE_PARAM_BASE) failed with error %i", rv);
			free(data);
			welsEncoder->Uninitialize();
			WelsDestroySVCEncoder(welsEncoder);
			return false;
		}
	} else if (mpCompression == MP_COMPRESSION_RAWI420) {
		needI420 = true;
		mpPixelsize = 3;
		mpStride = mpWidth * mpPixelsize;
//		vlog.verbose("RAWI420: w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u", w, h, stride, mpWidth, mpStride);
		/* U and V each need 1/4 of pixels plus extra column for
		 * uneven mpWidth and extra line for uneven mpHeight
		 */
		dataSize = i420DataSize(mpWidth, mpHeight);
		if (is888) {
//			vlog.verbose("RAWI420: directly use OutStream buffer with size %u", dataSize);
			direct = true;
			data = getptr(dataSize);
		} else {
			data = (rdr::U8*) malloc(dataSize);
			if (!data) {
				vlog.error("malloc(%u) for RAWI420 failed", dataSize);
				return false;
			}
		}
	} else if (mpCompression == MP_COMPRESSION_ZSTD) {
		mpPixelsize = 4;
		mpStride = mpWidth * mpPixelsize;
		dataSize = mpHeight * mpStride;
		if (!zstdCctx) {
			zstdCctx = ZSTD_createCCtx();
			if (!zstdCctx) {
				vlog.error("ZSTD_createCCtx() failed");
				return false;
			}
		}
		data = (rdr::U8*) malloc(dataSize);
		if (!data) {
			vlog.error("malloc(%u) for ZSTD failed", dataSize);
			return false;
		}
	} else if (mpCompression == MP_COMPRESSION_I420ZSTD) {
		needI420 = true;
		wantOptSize = true;
		mpPixelsize = 3;
		mpStride = mpWidth * mpPixelsize;
//		vlog.verbose("I420ZSTD: w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u", w, h, stride, mpWidth, mpStride);
		if (!zstdCctx) {
			zstdCctx = ZSTD_createCCtx();
			if (!zstdCctx) {
				vlog.error("ZSTD_createCCtx() failed");
				return false;
			}
		}
		/* U and V each need 1/4 of pixels plus extra column for
		 * uneven mpWidth and extra line for uneven mpHeight
		 */
		dataSize = i420DataSize(mpWidth, mpHeight);
		data = (rdr::U8*) malloc(dataSize);
		if (!data) {
			vlog.error("malloc(%u) for I420ZSTD I420 data failed", dataSize);
			return false;
		}
	} else {
		vlog.error("invalid MP compression %u", mpCompression);
		return false;
	}
	if (!is888) {
		vlog.verbose("not is888, use tmpBuf: w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u", w, h, stride, mpWidth, mpStride);
		tmpBuf = (rdr::U8*) malloc(rectSize * srcPixelsize);
		if (!tmpBuf) {
			vlog.error("malloc for tmpBuf failed");
			if (mpCompression == MP_COMPRESSION_RAW) {
				if (!direct)
					free(data);
			} else if (mpCompression == MP_COMPRESSION_JPEG) {
				free(data);
#if !defined(__APPLE__)
			} else if (mpCompression == MP_COMPRESSION_JPEGXL) {
				free(data);
#endif
			} else if (mpCompression == MP_COMPRESSION_H264) {
				free(data);
				welsEncoder->Uninitialize();
				WelsDestroySVCEncoder(welsEncoder);
			} else if (mpCompression == MP_COMPRESSION_RAWI420) {
				if (!direct)
					free(data);
			} else if (mpCompression == MP_COMPRESSION_ZSTD) {
				free(data);
			} else if (mpCompression == MP_COMPRESSION_I420ZSTD) {
				free(data);
			}
			return false;
		}
		pf.bufferFromBuffer(tmpBuf, pf, (const rdr::U8 *)buf, w, h, w, stride);
		buf = tmpBuf;
		stride = w;
	}

	/* stride is number of pixels per full image line,
	 * mpStride is number of bytes per line (usually stride * srcPixelsize),
	 * both can be more than what is used.
	 */
	if(stride != w || w != mpWidth || mpStride != w * srcPixelsize) {
		const int srcoffset = stride * srcPixelsize;
		const int targetoffset = mpStride;
		int x;
		int y;

//		vlog.verbose("wanted compression: %u, selected compression: %u, mpLevel: %u, w: %u, h: %u, wExtra: %u, hExtra: %u, stride: %u, mpStride: %u", mpCompressionWanted, mpCompression, mpLevel, w, h, wExtra ? 1 : 0, hExtra ? 1 : 0, stride, mpStride);
		if (!needI420) {
			if (mpPixelsize == srcPixelsize) {
				const int linesize = w * srcPixelsize;

#pragma GCC ivdep
				for (y = 0; y < h ; y++)
					memcpy(data + y * targetoffset, buf + y * srcoffset, linesize);
				if (wExtra) {
#pragma GCC ivdep
					for (y = 0; y < h ; y++)
						memcpy(data + y * targetoffset + w * mpPixelsize, data + y * targetoffset + (w - 1) * mpPixelsize, mpPixelsize);
				}
				if (hExtra)
					memcpy(data + h * targetoffset, data + (h - 1) * targetoffset, linesize + (wExtra ? mpPixelsize : 0));
			} else {
#pragma GCC ivdep
				for (y = 0; y < h ; y++) {
#pragma GCC ivdep
					for (x = 0; x < w ; x++)
						memcpy(data + targetoffset * y + mpPixelsize * x, buf + srcoffset * y + srcPixelsize * x, mpPixelsize);
				}
				if (wExtra) {
#pragma GCC ivdep
					for (y = 0; y < h ; y++)
						memcpy(data + targetoffset * y + mpPixelsize * w, data + targetoffset * y + mpPixelsize * (w - 1), mpPixelsize);
				}
				if (hExtra)
					memcpy(data + targetoffset * h, data + targetoffset * (h - 1), mpStride);
			}
		} else {
			rdr::U8 * ustart = data + mpWidth * mpHeight;
			rdr::U8 * vstart = ustart + mpWidth * mpHeight / 4 + (mpWidth & 1) * mpHeight + (mpHeight & 1) * mpWidth;
			const int uvlinesize = (mpWidth + 1) / 2;
#pragma GCC ivdep
			for (y = 0; y < h ; y++) {
				const int ysrcpos = srcoffset * y;
				const int ytargetpos = mpWidth * y;
#pragma GCC ivdep
				for (x = 0; x < w ; x++) {
					const int xsrcpos = ysrcpos + srcPixelsize * x;
					const int xtargetpos = ytargetpos + x;

					data[xtargetpos] = calcY(buf[xsrcpos], buf[xsrcpos+1], buf[xsrcpos+2]);
				}
			}
			if (wExtra) {
#pragma GCC ivdep
				for (y = 0; y < h ; y++) {
					const int xtargetpos = mpWidth * y + w;

					data[xtargetpos] = data[xtargetpos - 1];
				}
			}
#pragma GCC ivdep
			for (y = 0; y < h ; y += 2) {
				const int ysrcpos = srcoffset * y;
				const int ytargetpos = (y / 2) * uvlinesize;
#pragma GCC ivdep
				for (x = 0; x < w ; x += 2) {
					const int xsrcpos = ysrcpos + srcPixelsize * x;
					const int xtargetpos = ytargetpos + x / 2;

					ustart[xtargetpos] = calcU(buf[xsrcpos], buf[xsrcpos+1], buf[xsrcpos+2]);
					vstart[xtargetpos] = calcV(buf[xsrcpos], buf[xsrcpos+1], buf[xsrcpos+2]);
				}
			}
			if (wExtra) {
#pragma GCC ivdep
				for (y = 0; y < h ; y += 2) {
					const int xsrcpos = srcoffset * y + srcPixelsize * (w - 1);
					const int xtargetpos = (y / 2) * uvlinesize + w / 2;

					ustart[xtargetpos] = calcU(buf[xsrcpos], buf[xsrcpos+1], buf[xsrcpos+2]);
					vstart[xtargetpos] = calcV(buf[xsrcpos], buf[xsrcpos+1], buf[xsrcpos+2]);
				}
			}
			if (hExtra)
				memcpy(data + mpWidth * h, data + mpWidth * (h - 1), mpWidth);
		}
	} else {
		if (mpCompression == MP_COMPRESSION_RAW) {
//			vlog.verbose("direct use: wanted compression: %u, selected compression: %u, mpLevel: %u, w: %u, h: %u, stride: %u, mpStride: %u", mpCompressionWanted, mpCompression, mpLevel, w, h, stride, mpStride);
			if (direct) {
				memcpy(data, buf, dataSize);
			} else {
				free(data);
				data = (rdr::U8*) buf;
			}
		} else {
//			vlog.verbose("direct copy: wanted compression: %u, selected compression: %u, mpLevel: %u, w: %u, h: %u, stride: %u, mpStride: %u", mpCompressionWanted, mpCompression, mpLevel, w, h, stride, mpStride);
			if (needI420) {
				rgbToI420(data, (rdr::U8*) buf, w, h, srcPixelsize);
			} else {
				memcpy(data, buf, rectSize * srcPixelsize);
			}
		}
	}

	/* write out: body */
	if (mpCompression == MP_COMPRESSION_RAW) {
		if (direct) {
			reposition(length() + dataSize);
		} else {
			writeBytes(data, dataSize);
			if (data != buf)
				free(data);
		}
	} else if (mpCompression == MP_COMPRESSION_JPEG) {
		JSAMPROW *jpegRowPointer;

		jpegRowPointer = new JSAMPROW[mpHeight];
#pragma GCC ivdep
		for (int y = 0; y < mpHeight; y++)
			jpegRowPointer[y] = (JSAMPROW) data + y * mpStride;
		jpeg_start_compress(jpegCinfo, TRUE);
		while (jpegCinfo->next_scanline < jpegCinfo->image_height)
			jpeg_write_scanlines(jpegCinfo, &jpegRowPointer[jpegCinfo->next_scanline],
				jpegCinfo->image_height - jpegCinfo->next_scanline);
		jpeg_finish_compress(jpegCinfo);
		delete[] jpegRowPointer;
		free(data);
#if !defined(__APPLE__)
	} else if (mpCompression == MP_COMPRESSION_JPEGXL) {
		jpegxlStatus = JxlEncoderAddImageFrame(jpegxlFrameSettings,
			&jpegxlPixelFormat,
			(void*) data,
			dataSize);
		if (JXL_ENC_SUCCESS != jpegxlStatus) {
			vlog.error("JxlEncoderAddImageFrame() failed with error %i", JxlEncoderGetError(jpegxlEnc));
			free(data);
			if (tmpBuf)
				free(tmpBuf);
			return false;
		}
		JxlEncoderCloseInput(jpegxlEnc);
		uint8_t* nextOut = getptr(2048);
		uint8_t* startOut = nextOut;
		size_t availOut = avail();
		JxlEncoderStatus jpegxlProcessResult = JXL_ENC_NEED_MORE_OUTPUT;
		while (jpegxlProcessResult == JXL_ENC_NEED_MORE_OUTPUT) {
			jpegxlProcessResult = JxlEncoderProcessOutput(jpegxlEnc, &nextOut, &availOut);
			if (jpegxlProcessResult == JXL_ENC_NEED_MORE_OUTPUT) {
				size_t offset = nextOut - startOut;
				vlog.debug("JPEGXL: need more bytes at offset %lu, avail %lu, increasing", offset, availOut);
				startOut = getptr(2 * avail());
				nextOut = startOut + offset;
				availOut = avail() - offset;
				vlog.debug("JPEGXL: needed more bytes, increased to avail of %lu", availOut);
				if (availOut < 2048) {
					vlog.error("failed to increase buffer!");
					if (!direct)
						free(data);
					if (tmpBuf)
						free(tmpBuf);
					return false;
				}
			}
		}
		free(data);
		if (JXL_ENC_SUCCESS != jpegxlProcessResult) {
			vlog.error("JxlEncoderProcessOutput() failed with error %i", JxlEncoderGetError(jpegxlEnc));
			if (tmpBuf)
				free(tmpBuf);
			return false;
		}
		setptr(nextOut - startOut);
//		vlog.verbose("JPEGXL: done, added %lu bytes of output, length is %lu", nextOut - startOut, length());
#endif
	} else if (mpCompression == MP_COMPRESSION_H264) {
		SFrameBSInfo info;
		int rv;
		int i;
		int j;
		SLayerBSInfo* pLayer;
		int iCurLayerBits;

		/* prepare pic: 12 Bit / pixel: I420: 1(Y) + 1/4(U) + 1/4(V) */
		memset (&welsPic, 0, sizeof(welsPic));
		welsPic.iColorFormat = videoFormatI420;
		welsPic.pData[0] = data;
		welsPic.pData[1] = welsPic.pData[0] + mpWidth * mpHeight;
		welsPic.pData[2] = welsPic.pData[1] + mpWidth * mpHeight / 4 + (mpWidth & 1) * mpHeight + (mpHeight & 1) * mpWidth;
		welsPic.iPicWidth = mpWidth;
		welsPic.iPicHeight = mpHeight;
		welsPic.iStride[0] = welsPic.iPicWidth;
		welsPic.iStride[1] = welsPic.iStride[2] = welsPic.iPicWidth >> 1;

		/* encode */
		memset (&info, 0, sizeof(info));
		rv = welsEncoder->EncodeFrame(&welsPic, &info);
		free(data);
		if (rv == cmResultSuccess && info.eFrameType != videoFrameTypeInvalid && info.eFrameType != videoFrameTypeSkip && info.iLayerNum > 0) {
			for (i = 0; i < info.iLayerNum; i++) {
				pLayer = &info.sLayerInfo[i];
				iCurLayerBits = 0;
				for (j = 0; j < pLayer->iNalCount; j++) {
					iCurLayerBits += pLayer->pNalLengthInByte[j];
				}
				writeBytes(pLayer->pBsBuf, iCurLayerBits);
			}
		} else {
			vlog.error("EncodeFrame() failed with rv %i, info.eFrameType %i, info.iLayerNum %i", rv, info.eFrameType, info.iLayerNum);
			welsEncoder->Uninitialize();
			WelsDestroySVCEncoder(welsEncoder);
			if (tmpBuf)
				free(tmpBuf);
			return false;
		}
		welsEncoder->Uninitialize();
		WelsDestroySVCEncoder(welsEncoder);
	} else if (mpCompression == MP_COMPRESSION_RAWI420) {
		if (direct) {
			reposition(length() + dataSize);
		} else {
			writeBytes(data, dataSize);
			free(data);
		}
	} else if (mpCompression == MP_COMPRESSION_ZSTD) {
		const int compressedDataSize = ZSTD_compressBound(dataSize);
		int compressedSize;

		compressedSize = ZSTD_compressCCtx(zstdCctx, getptr(compressedDataSize), compressedDataSize, data, dataSize, VNCZstdLevel);
		if (compressedSize > 0) {
			reposition(length() + compressedSize);
		} else {
			vlog.error("ZSTD: ZSTD_compressCCtx() failed with error %i", compressedSize);
			free(data);
			if (tmpBuf)
				free(tmpBuf);
			return false;
		}
		free(data);
	} else if (mpCompression == MP_COMPRESSION_I420ZSTD) {
		const int compressedDataSize = ZSTD_compressBound(dataSize);
		int compressedSize;

		compressedSize = ZSTD_compressCCtx(zstdCctx, getptr(compressedDataSize), compressedDataSize, data, dataSize, VNCZstdLevel);
		if (compressedSize > 0) {
			reposition(length() + compressedSize);
		} else {
			vlog.error("I420+ZSTD: ZSTD_compressCctx() failed with error %i", compressedSize);
			free(data);
			if (tmpBuf)
				free(tmpBuf);
			return false;
		}
		free(data);
	}
	if (tmpBuf)
		free(tmpBuf);

	/* size check */
	const int usedSize = length() - startPos;
	if (   mpCompression != MP_COMPRESSION_RAWI420
	    && mpCompression != MP_COMPRESSION_RAW
	    && usedSize >= MP_COMPRESSION_MIN_OPT_SIZE) {
		if (wantOptSize && mpCompression != MP_COMPRESSION_I420ZSTD) {
			const int i420Size = i420DataSize(w, h) / 2;

			if (usedSize > i420Size + MP_COMPRESSION_MIN_SAVE_SIZE) {
				vlog.verbose("wantOptSize: mpCompression: %s, w: %u, h: %u, stride: %u, mpWidth: %u, mpStride: %u, usedSize %u > i420Size %u + %u, repeat with I420ZSTD", mpCompressionName(mpCompression), w, h, stride, mpWidth, mpStride, usedSize, i420Size, MP_COMPRESSION_MIN_SAVE_SIZE);
				reposition(headerStartPos);
				optimizationCount[MP_COMPRESSION_I420ZSTD]++;
				return compress(buf, stride, r, pf, mpLevel, MP_COMPRESSION_I420ZSTD);
			}
		} else {
			const int rawSize = rectSize * 4;

			if (rawSize < 80 || mpCompression == MP_COMPRESSION_ZSTD || mpCompression == MP_COMPRESSION_I420ZSTD) {
				if (usedSize > rawSize + MP_COMPRESSION_MIN_SAVE_SIZE) {
					vlog.verbose("optimize: mpCompression: %s, mpLevel: %u, w: %u, h: %u, stride: %u, mpWidth: %u, usedSize %u > rawSize %u + %u, repeat with RAW", mpCompressionName(mpCompression), mpLevel, w, h, stride, mpWidth, usedSize, rawSize, MP_COMPRESSION_MIN_SAVE_SIZE);
					reposition(headerStartPos);
					optimizationCount[MP_COMPRESSION_RAW]++;
					return compress(buf, stride, r, pf, mpLevel, MP_COMPRESSION_RAW);
				}
			} else {
				if (usedSize > rawSize / 2 + MP_COMPRESSION_MIN_SAVE_SIZE) {
					vlog.verbose("optimize: mpCompression: %s, mpLevel: %u, w: %u, h: %u, stride: %u, mpWidth: %u, usedSize %u > zstdEstSize %u + %u, repeat with ZSTD", mpCompressionName(mpCompression), mpLevel, w, h, stride, mpWidth, usedSize, rawSize / 2, MP_COMPRESSION_MIN_SAVE_SIZE);
					reposition(headerStartPos);
					optimizationCount[MP_COMPRESSION_ZSTD]++;
					return compress(buf, stride, r, pf, mpLevel, MP_COMPRESSION_ZSTD);
				}
			}
		}
	}

	if (vlog.getLevel() >= 100) {
		compressionCount[mpCompression]++;
		totalBytes[mpCompression] += mpWidth * mpHeight * srcPixelsize;
		totalCBytes[mpCompression] += usedSize;
		if (usedSize < minCompressed[mpCompression])
			minCompressed[mpCompression] = usedSize;
		if (usedSize > maxCompressed[mpCompression])
			maxCompressed[mpCompression] = usedSize;
	}
	return true;
}
#endif /* #if !defined(WIN32) */
