rygorous/gist:bdec6f2f3b193f6e0db54cc852b433c3

## gistfile1.txt
	// ep1 - ep0 >= 0: four-interpolated color mode
	// ep1 - ep0 < 0: six-interpolated-color mode
	//
	// we want to avoid ep0 = ep1 in general (because it gives us no useful
	// interpolated values). That means in four-interp mode we want ep1 - ep0 > 0,
	// and in six-interp mode we want ep1 - ep0 < 0. If they're identical or
	// have the wrong sign, we need to fix it!
	int ep_diff = ((ep1_int - ep0_int) ^ target_sign) - target_sign;
	if (ep_diff <= 0)
	{
		// Degenerate case is hit in three circumstances:
		// 1. the original LLS system was singular. In that case, init_index_cache
		//    sets us up with a linear system that ensures we end up here.
		// 2. the LLS solution gave two endpoints that are close enough together to
		//    quantize to the same number.
		// 3. the LLS solution gave us two endpoints that are ordered so they don't
		//    give the mode we want to hit.
		//
		// The third case is the most interesting one: it means the linear inequality
		// constraint implied by the endpoint ordering is active. The non-degenerate
		// cases (i.e. we never allow ep0==ep1) are:
		//
		//   Four-interp: ep0 + 1 <= ep1
		//   Six-interp:  ep1 + 1 <= ep0
		//
		// This is a quadratic problem and the constraint volume implied by these linear
		// inequalities is just a half-space. If the constraint is active, that means
		// the minimum is attained on the boundary, i.e. on the ep0 + 1 = ep1 (or
		// respectively ep1 + 1 = ep0) line.
		//
		// So we need to find the values for ep0 and ep1 that are 1 apart from each
		// other, in the right mode, and minimize the quadratic error. This least-squares
		// problem is (here the version for four-interp)
		//
		// min  ||    [ e ]            ||^2
		//  e   ||A * [e+1] - scale*k*b||
		//
		// where scale = max_quant / max_dequant and the nonzero rows of A all sum
		// to k, the denominator of the interpolation weights (constant per mode).
		// Split A into its two columns A=[a_0 a_1], then a_0 + a_1 = k (in the
		// rows that are not zero) and this simplifies to
		//
		// min ||k * e + a_1 - scale*k*b||^2
		//  e
		//
		// Dividing by k yields
		//
		// min ||e - (scale*b - (1/k)*a_1)||^2
		//  e
		//
		// which is clearly minimized when e = mean(scale*b - a_1/k) = (scale/k)*mean(k*b) - mean(a_1)/k
		// ABsum is sum(k*b), -mean(a_1)/k can be precomputed in the index cache (this is degen_bias).
		//
		// Six-interp mode works the same way but degen_bias has the opposite sign.
		F32 mean = static_cast<F32>(ABsum) * ic->inv_ata[3] + ic->degen_bias;

		ep0_int = rr_round_to_int(mean);

		if (target_sign == 0)
		{
			ep0_int = RR_CLAMP(ep0_int, info.enc.min_q, info.enc.max_q - 1);
			ep1_int = ep0_int + 1;
		}
		else
		{
			ep0_int = RR_CLAMP(ep0_int, info.enc.min_q + 1, info.enc.max_q);
			ep1_int = ep0_int - 1;

			// and if that violates our constraints, we need to enforce them again,
			// no matter how much it hurts
			if ( blki.constraints.mask & 0x0000ffffu ) ep1_int = info.enc.min_q;
			if ( blki.constraints.mask & 0xffff0000u ) ep0_int = info.enc.max_q;
		}

		RR_ASSERT(ep0_int >= info.enc.min_q && ep0_int <= info.enc.max_q);
		RR_ASSERT(ep1_int >= info.enc.min_q && ep1_int <= info.enc.max_q);
		RR_ASSERT(((ep1_int - ep0_int) ^ target_sign) >= 0);
	}
	// ep1 - ep0 >= 0: four-interpolated color mode
	// ep1 - ep0 < 0: six-interpolated-color mode
	//
	// we want to avoid ep0 = ep1 in general (because it gives us no useful
	// interpolated values). That means in four-interp mode we want ep1 - ep0 > 0,
	// and in six-interp mode we want ep1 - ep0 < 0. If they're identical or
	// have the wrong sign, we need to fix it!
	int ep_diff = ((ep1_int - ep0_int) ^ target_sign) - target_sign;
	if (ep_diff <= 0)
	{
	// Degenerate case is hit in three circumstances:
	// 1. the original LLS system was singular. In that case, init_index_cache
	// sets us up with a linear system that ensures we end up here.
	// 2. the LLS solution gave two endpoints that are close enough together to
	// quantize to the same number.
	// 3. the LLS solution gave us two endpoints that are ordered so they don't
	// give the mode we want to hit.
	//
	// The third case is the most interesting one: it means the linear inequality
	// constraint implied by the endpoint ordering is active. The non-degenerate
	// cases (i.e. we never allow ep0==ep1) are:
	//
	// Four-interp: ep0 + 1 <= ep1
	// Six-interp: ep1 + 1 <= ep0
	//
	// This is a quadratic problem and the constraint volume implied by these linear
	// inequalities is just a half-space. If the constraint is active, that means
	// the minimum is attained on the boundary, i.e. on the ep0 + 1 = ep1 (or
	// respectively ep1 + 1 = ep0) line.
	//
	// So we need to find the values for ep0 and ep1 that are 1 apart from each
	// other, in the right mode, and minimize the quadratic error. This least-squares
	// problem is (here the version for four-interp)
	//
	// min \|\| [ e ] \|\|^2
	// e \|\|A * [e+1] - scalekb\|\|
	//
	// where scale = max_quant / max_dequant and the nonzero rows of A all sum
	// to k, the denominator of the interpolation weights (constant per mode).
	// Split A into its two columns A=[a_0 a_1], then a_0 + a_1 = k (in the
	// rows that are not zero) and this simplifies to
	//
	// min \|\|k * e + a_1 - scalekb\|\|^2
	// e
	//
	// Dividing by k yields
	//
	// min \|\|e - (scaleb - (1/k)a_1)\|\|^2
	// e
	//
	// which is clearly minimized when e = mean(scaleb - a_1/k) = (scale/k)mean(k*b) - mean(a_1)/k
	// ABsum is sum(k*b), -mean(a_1)/k can be precomputed in the index cache (this is degen_bias).
	//
	// Six-interp mode works the same way but degen_bias has the opposite sign.
	F32 mean = static_cast<F32>(ABsum) * ic->inv_ata[3] + ic->degen_bias;

	ep0_int = rr_round_to_int(mean);

	if (target_sign == 0)
	{
	ep0_int = RR_CLAMP(ep0_int, info.enc.min_q, info.enc.max_q - 1);
	ep1_int = ep0_int + 1;
	}
	else
	{
	ep0_int = RR_CLAMP(ep0_int, info.enc.min_q + 1, info.enc.max_q);
	ep1_int = ep0_int - 1;

	// and if that violates our constraints, we need to enforce them again,
	// no matter how much it hurts
	if ( blki.constraints.mask & 0x0000ffffu ) ep1_int = info.enc.min_q;
	if ( blki.constraints.mask & 0xffff0000u ) ep0_int = info.enc.max_q;
	}

	RR_ASSERT(ep0_int >= info.enc.min_q && ep0_int <= info.enc.max_q);
	RR_ASSERT(ep1_int >= info.enc.min_q && ep1_int <= info.enc.max_q);
	RR_ASSERT(((ep1_int - ep0_int) ^ target_sign) >= 0);
	}