Last active
November 23, 2024 02:01
-
-
Save agyild/82219c545228d70c5604f865ce0b0ce5 to your computer and use it in GitHub Desktop.
AMD FidelityFX Super Resolution v1.0.2 for mpv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. | |
// | |
// Permission is hereby granted, free of charge, to any person obtaining a copy | |
// of this software and associated documentation files (the "Software"), to deal | |
// in the Software without restriction, including without limitation the rights | |
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
// copies of the Software, and to permit persons to whom the Software is | |
// furnished to do so, subject to the following conditions: | |
// | |
// The above copyright notice and this permission notice shall be included in | |
// all copies or substantial portions of the Software. | |
// | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
// THE SOFTWARE. | |
// FidelityFX FSR v1.0.2 by AMD
// ported to mpv by agyild
// Changelog
// Made it compatible with pre-OpenGL 4.0 renderers
// Made it operate directly on the LUMA plane, since the original shader was operating on LUMA by deriving it from RGB. This should cause a major increase in performance, especially on OpenGL 4.0+ renderers (4+2 texture lookups vs. 12+5)
// Removed the transparency preservation mechanism, since the alpha channel is a separate source plane from LUMA
// Added optional performance-saving lossy optimizations to EASU (Credit: atyuwen, https://atyuwen.github.io/posts/optimizing-fsr/)
//
// Notes
// Per AMD's guidelines, FSR only upscales content by up to 4x the pixel count, i.e. 2x per dimension (e.g., 1080p -> 2160p, 720p -> 1440p), and by any smaller factor in between.
// This means FSR will scale up to 4x at maximum, and any further scaling will be processed by mpv's scalers.
//!HOOK LUMA
//!BIND HOOKED
//!SAVE EASUTEX
//!DESC FidelityFX Super Resolution v1.0.2 (EASU)
//!WHEN OUTPUT.w OUTPUT.h * LUMA.w LUMA.h * / 1.0 >
//!WIDTH OUTPUT.w OUTPUT.w LUMA.w 2 * < * LUMA.w 2 * OUTPUT.w LUMA.w 2 * > * + OUTPUT.w OUTPUT.w LUMA.w 2 * = * +
//!HEIGHT OUTPUT.h OUTPUT.h LUMA.h 2 * < * LUMA.h 2 * OUTPUT.h LUMA.h 2 * > * + OUTPUT.h OUTPUT.h LUMA.h 2 * = * +
//!COMPONENTS 1
// User variables - EASU
#define FSR_PQ 0 // Whether the source content has PQ gamma or not. Needs to be set to the same value for both passes. 0 or 1.
#define FSR_EASU_DERING 1 // If set to 0, disables deringing for a small increase in performance. 0 or 1.
#define FSR_EASU_SIMPLE_ANALYSIS 0 // If set to 1, uses a simpler single-pass direction and length analysis for an increase in performance. 0 or 1.
#define FSR_EASU_QUIT_EARLY 0 // If set to 1, uses bilinear filtering for non-edge pixels and skips EASU on those regions for an increase in performance. 0 or 1.

// Shader code

// FSR_EASU_DIR_THRESHOLD is the reciprocal of the squared gradient magnitude below which
// hook() treats a pixel as having no usable edge direction. It may be pre-defined by the
// user; otherwise it is chosen from FSR_EASU_QUIT_EARLY. Any other value of
// FSR_EASU_QUIT_EARLY is rejected here with a clear message instead of failing later on an
// undefined macro.
#ifndef FSR_EASU_DIR_THRESHOLD
#if (FSR_EASU_QUIT_EARLY == 1)
#define FSR_EASU_DIR_THRESHOLD 64.0
#elif (FSR_EASU_QUIT_EARLY == 0)
#define FSR_EASU_DIR_THRESHOLD 32768.0
#else
#error FSR_EASU_QUIT_EARLY must be 0 or 1
#endif
#endif
// Low-precision reciprocal estimate of 'a': subtracts the raw bit pattern of 'a' from a
// fixed constant and reinterprets the result as a float, avoiding a division.
float APrxLoRcpF1(float a) {
    uint inputBits = floatBitsToUint(a);
    uint estimateBits = uint(0x7ef07ebb) - inputBits;
    return uintBitsToFloat(estimateBits);
}
// Low-precision reciprocal square root estimate of 'a': halves the raw bit pattern of 'a',
// subtracts it from a fixed constant and reinterprets the result as a float.
float APrxLoRsqF1(float a) {
    uint halvedBits = floatBitsToUint(a) >> uint(1);
    uint estimateBits = uint(0x5f347d74) - halvedBits;
    return uintBitsToFloat(estimateBits);
}
// Returns the smallest of three scalars.
float AMin3F1(float x, float y, float z) {
    float smallerOfYZ = min(y, z);
    return min(x, smallerOfYZ);
}
// Returns the largest of three scalars.
float AMax3F1(float x, float y, float z) {
    float largerOfYZ = max(y, z);
    return max(x, largerOfYZ);
}
#if (FSR_PQ == 1)
// Raises a sample to the 4th power (only compiled when FSR_PQ is 1).
// Implemented as two squarings rather than pow(): cheaper, well-defined for every input,
// and the exact algebraic inverse of the sqrt(sqrt(x)) applied by FromGamma2 in the RCAS pass.
float ToGamma2(float a) {
    float squared = a * a;
    return squared * squared;
}
#endif
// Adds the contribution of a single source tap to the scaler's running weighted sum.
void FsrEasuTap(
    inout float aC, // Accumulated color, with negative lobe.
    inout float aW, // Accumulated weight.
    vec2 off,       // Pixel offset from resolve position to tap.
    vec2 dir,       // Gradient direction.
    vec2 len,       // Length.
    float lob,      // Negative lobe strength.
    float clp,      // Clipping point.
    float c) {      // Tap color.
    // Rotate the offset into the gradient-aligned frame, then apply the anisotropic scale.
    vec2 rotated = vec2(
        (off.x * ( dir.x)) + (off.y * dir.y),
        (off.x * (-dir.y)) + (off.y * dir.x));
    rotated *= len;

    // Squared distance, limited to the window: at corners, 2 taps can easily fall outside it.
    float distSq = min(rotated.x * rotated.x + rotated.y * rotated.y, clp);

    // Approximation of lanczos2 without sin(), rcp() or sqrt() to get x:
    //  (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
    //  |_______________________________________|   |_______________|
    //                   base                             window
    // The general form of the 'base' is
    //  (a*(b*x^2-1)^2-(a-1))
    // where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
    float baseTerm = float(2.0 / 5.0) * distSq + -1.0;
    baseTerm *= baseTerm;
    baseTerm = float(25.0 / 16.0) * baseTerm + float(-(25.0 / 16.0 - 1.0));

    float windowTerm = lob * distSq + -1.0;
    windowTerm *= windowTerm;

    // Fold this tap into the weighted average.
    float weight = baseTerm * windowTerm;
    aC += c * weight;
    aW += weight;
}
// Derives a gradient direction and an edge "length" from a '+' shaped set of luma samples.
// With FSR_EASU_SIMPLE_ANALYSIS == 1 the '+' samples are first bilinearly interpolated from
// the 12 taps and 'dir'/'len' are overwritten; with FSR_EASU_SIMPLE_ANALYSIS == 0 the result
// for one corner (selected by biS/biT/biU/biV) is scaled by that corner's bilinear weight
// and added to 'dir'/'len'.
void FsrEasuSet(
    inout vec2 dir,
    inout float len,
    vec2 pp,
#if (FSR_EASU_SIMPLE_ANALYSIS == 1)
    float b, float c,
    float i, float j, float f, float e,
    float k, float l, float h, float g,
    float o, float n
#elif (FSR_EASU_SIMPLE_ANALYSIS == 0)
    bool biS, bool biT, bool biU, bool biV,
    float lA, float lB, float lC, float lD, float lE
#endif
) {
    // Bilinear weights of the 2x2 neighbourhood around the resolve position:
    //  s t
    //  u v
#if (FSR_EASU_SIMPLE_ANALYSIS == 1)
    vec4 w = vec4(
        (1.0 - pp.x) * (1.0 - pp.y),
        pp.x * (1.0 - pp.y),
        (1.0 - pp.x) * pp.y,
        pp.x * pp.y);
    // Interpolate each arm of the '+' from the four taps surrounding it.
    float lA = dot(w, vec4(b, c, f, g));
    float lB = dot(w, vec4(e, f, i, j));
    float lC = dot(w, vec4(f, g, j, k));
    float lD = dot(w, vec4(g, h, k, l));
    float lE = dot(w, vec4(j, k, n, o));
#elif (FSR_EASU_SIMPLE_ANALYSIS == 0)
    // The flags are literal constants at every call site, so these branches fold away.
    // Checked from V down to S so that the last flag set takes precedence.
    float w = 0.0;
    if (biV) {
        w = pp.x * pp.y;
    } else if (biU) {
        w = (1.0 - pp.x) * pp.y;
    } else if (biT) {
        w = pp.x * (1.0 - pp.y);
    } else if (biS) {
        w = (1.0 - pp.x) * (1.0 - pp.y);
    }
#endif

    // Direction is the '+' diff.
    //    a
    //  b c d
    //    e
    // The magnitude comes from the larger absolute difference on either side of 'c'.
    // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped,
    // then the horizontal and vertical terms are added.

    // Horizontal axis.
    float dirX = lD - lB;
    float rcpStepX = APrxLoRcpF1(max(abs(lD - lC), abs(lC - lB)));
    float lenX = clamp(abs(dirX) * rcpStepX, 0.0, 1.0);
    lenX *= lenX;

    // Vertical axis.
    float dirY = lE - lA;
    float rcpStepY = APrxLoRcpF1(max(abs(lE - lC), abs(lC - lA)));
    float lenY = clamp(abs(dirY) * rcpStepY, 0.0, 1.0);
    lenY *= lenY;

#if (FSR_EASU_SIMPLE_ANALYSIS == 1)
    len = lenX + lenY;
    dir = vec2(dirX, dirY);
#elif (FSR_EASU_SIMPLE_ANALYSIS == 0)
    dir += vec2(dirX, dirY) * w;
    len += w * lenX + w * lenY;
#endif
}
// EASU pass entry point: edge-adaptive upsampling of the hooked single-channel plane.
// Reads a 12-tap neighbourhood around the output position, estimates the local edge
// direction and strength, and resolves the output with a rotated, anisotropic kernel.
// Returns the result in the red channel; alpha is fixed at 1.0.
vec4 hook() {
    // Result
    vec4 pix = vec4(0.0, 0.0, 0.0, 1.0);

    //------------------------------------------------------------------------------------------------------------------------------
    //      +---+---+
    //      |   |   |
    //      +--(0)--+
    //      | b | c |
    //  +---F---+---+---+
    //  | e | f | g | h |
    //  +--(1)--+--(2)--+
    //  | i | j | k | l |
    //  +---+---+---+---+
    //      | n | o |
    //      +--(3)--+
    //      |   |   |
    //      +---+---+
    // Get position of 'F'.
    vec2 pp = HOOKED_pos * HOOKED_size - vec2(0.5);
    vec2 fp = floor(pp);
    pp -= fp; // 'pp' is now the fractional position inside the f/g/j/k quad.

    //------------------------------------------------------------------------------------------------------------------------------
    // 12-tap kernel.
    //    b c
    //  e f g h
    //  i j k l
    //    n o
    // Gather 4 ordering.
    //  a b
    //  r g
    // Allowing dead-code removal to remove the 'z's.
    // GL_ES is tested with defined(): it is not a defined macro on desktop GLSL, and using
    // an undefined macro name directly in a #if expression is an error per the GLSL spec.
#if (defined(HOOKED_gather) && (__VERSION__ >= 400 || (defined(GL_ES) && __VERSION__ >= 310)))
    vec4 bczzL = HOOKED_gather(vec2((fp + vec2(1.0, -1.0)) * HOOKED_pt), 0);
    vec4 ijfeL = HOOKED_gather(vec2((fp + vec2(0.0,  1.0)) * HOOKED_pt), 0);
    vec4 klhgL = HOOKED_gather(vec2((fp + vec2(2.0,  1.0)) * HOOKED_pt), 0);
    vec4 zzonL = HOOKED_gather(vec2((fp + vec2(1.0,  3.0)) * HOOKED_pt), 0);
#else
    // pre-OpenGL 4.0 compatibility: one lookup per tap, addressed at texel centres.
    float b = HOOKED_tex(vec2((fp + vec2( 0.5, -0.5)) * HOOKED_pt)).r;
    float c = HOOKED_tex(vec2((fp + vec2( 1.5, -0.5)) * HOOKED_pt)).r;

    float e = HOOKED_tex(vec2((fp + vec2(-0.5,  0.5)) * HOOKED_pt)).r;
    float f = HOOKED_tex(vec2((fp + vec2( 0.5,  0.5)) * HOOKED_pt)).r;
    float g = HOOKED_tex(vec2((fp + vec2( 1.5,  0.5)) * HOOKED_pt)).r;
    float h = HOOKED_tex(vec2((fp + vec2( 2.5,  0.5)) * HOOKED_pt)).r;

    float i = HOOKED_tex(vec2((fp + vec2(-0.5,  1.5)) * HOOKED_pt)).r;
    float j = HOOKED_tex(vec2((fp + vec2( 0.5,  1.5)) * HOOKED_pt)).r;
    float k = HOOKED_tex(vec2((fp + vec2( 1.5,  1.5)) * HOOKED_pt)).r;
    float l = HOOKED_tex(vec2((fp + vec2( 2.5,  1.5)) * HOOKED_pt)).r;

    float n = HOOKED_tex(vec2((fp + vec2( 0.5,  2.5)) * HOOKED_pt)).r;
    float o = HOOKED_tex(vec2((fp + vec2( 1.5,  2.5)) * HOOKED_pt)).r;

    // Pack into the same layout the gather path produces.
    vec4 bczzL = vec4(b, c, 0.0, 0.0);
    vec4 ijfeL = vec4(i, j, f, e);
    vec4 klhgL = vec4(k, l, h, g);
    vec4 zzonL = vec4(0.0, 0.0, o, n);
#endif

    //------------------------------------------------------------------------------------------------------------------------------
    // Rename.
    float bL = bczzL.x;
    float cL = bczzL.y;
    float iL = ijfeL.x;
    float jL = ijfeL.y;
    float fL = ijfeL.z;
    float eL = ijfeL.w;
    float kL = klhgL.x;
    float lL = klhgL.y;
    float hL = klhgL.z;
    float gL = klhgL.w;
    float oL = zzonL.z;
    float nL = zzonL.w;

#if (FSR_PQ == 1)
    // Not the most performance-friendly solution, but should work until mpv adds proper gamma transformation functions for shaders
    bL = ToGamma2(bL);
    cL = ToGamma2(cL);
    iL = ToGamma2(iL);
    jL = ToGamma2(jL);
    fL = ToGamma2(fL);
    eL = ToGamma2(eL);
    kL = ToGamma2(kL);
    lL = ToGamma2(lL);
    hL = ToGamma2(hL);
    gL = ToGamma2(gL);
    oL = ToGamma2(oL);
    nL = ToGamma2(nL);
#endif

    // Accumulate for bilinear interpolation.
    vec2 dir = vec2(0.0);
    float len = 0.0;
#if (FSR_EASU_SIMPLE_ANALYSIS == 1)
    FsrEasuSet(dir, len, pp, bL, cL, iL, jL, fL, eL, kL, lL, hL, gL, oL, nL);
#elif (FSR_EASU_SIMPLE_ANALYSIS == 0)
    // One '+' shaped analysis per corner of the f/g/j/k quad.
    FsrEasuSet(dir, len, pp, true, false, false, false, bL, eL, fL, gL, jL);
    FsrEasuSet(dir, len, pp, false, true, false, false, cL, fL, gL, hL, kL);
    FsrEasuSet(dir, len, pp, false, false, true, false, fL, iL, jL, kL, nL);
    FsrEasuSet(dir, len, pp, false, false, false, true, gL, jL, kL, lL, oL);
#endif

    //------------------------------------------------------------------------------------------------------------------------------
    // Normalize with approximation, and cleanup close to zero.
    vec2 dir2 = dir * dir;
    float dirR = dir2.x + dir2.y;
    bool zro = dirR < float(1.0 / FSR_EASU_DIR_THRESHOLD);
    dirR = APrxLoRsqF1(dirR);
#if (FSR_EASU_QUIT_EARLY == 1)
    // No usable edge direction: resolve with plain bilinear filtering and skip the kernel.
    if (zro) {
        vec4 w = vec4(0.0);
        w.x = (1.0 - pp.x) * (1.0 - pp.y);
        w.y =        pp.x  * (1.0 - pp.y);
        w.z = (1.0 - pp.x) *        pp.y;
        w.w =        pp.x  *        pp.y;
        pix.r = clamp(dot(w, vec4(fL, gL, jL, kL)), 0.0, 1.0);
        return pix;
    }
#elif (FSR_EASU_QUIT_EARLY == 0)
    // No usable edge direction: substitute 1.0 for the scale and for dir.x.
    dirR = zro ? 1.0 : dirR;
    dir.x = zro ? 1.0 : dir.x;
#endif
    dir *= vec2(dirR);

    // Transform from {0 to 2} to {0 to 1} range, and shape with square.
    len = len * 0.5;
    len *= len;
    // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
    float stretch = (dir.x * dir.x + dir.y * dir.y) * APrxLoRcpF1(max(abs(dir.x), abs(dir.y)));
    // Anisotropic length after rotation,
    //  x := 1.0 lerp to 'stretch' on edges
    //  y := 1.0 lerp to 2x on edges
    vec2 len2 = vec2(1.0 + (stretch - 1.0) * len, 1.0 + -0.5 * len);
    // Based on the amount of 'edge',
    // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
    float lob = 0.5 + float((1.0 / 4.0 - 0.04) - 0.5) * len;
    // Set distance^2 clipping point to the end of the adjustable window.
    float clp = APrxLoRcpF1(lob);

    //------------------------------------------------------------------------------------------------------------------------------
    // Accumulation
    //    b c
    //  e f g h
    //  i j k l
    //    n o
    float aC = 0.0;
    float aW = 0.0;
    FsrEasuTap(aC, aW, vec2( 0.0,-1.0) - pp, dir, len2, lob, clp, bL); // b
    FsrEasuTap(aC, aW, vec2( 1.0,-1.0) - pp, dir, len2, lob, clp, cL); // c
    FsrEasuTap(aC, aW, vec2(-1.0, 1.0) - pp, dir, len2, lob, clp, iL); // i
    FsrEasuTap(aC, aW, vec2( 0.0, 1.0) - pp, dir, len2, lob, clp, jL); // j
    FsrEasuTap(aC, aW, vec2( 0.0, 0.0) - pp, dir, len2, lob, clp, fL); // f
    FsrEasuTap(aC, aW, vec2(-1.0, 0.0) - pp, dir, len2, lob, clp, eL); // e
    FsrEasuTap(aC, aW, vec2( 1.0, 1.0) - pp, dir, len2, lob, clp, kL); // k
    FsrEasuTap(aC, aW, vec2( 2.0, 1.0) - pp, dir, len2, lob, clp, lL); // l
    FsrEasuTap(aC, aW, vec2( 2.0, 0.0) - pp, dir, len2, lob, clp, hL); // h
    FsrEasuTap(aC, aW, vec2( 1.0, 0.0) - pp, dir, len2, lob, clp, gL); // g
    FsrEasuTap(aC, aW, vec2( 1.0, 2.0) - pp, dir, len2, lob, clp, oL); // o
    FsrEasuTap(aC, aW, vec2( 0.0, 2.0) - pp, dir, len2, lob, clp, nL); // n

    //------------------------------------------------------------------------------------------------------------------------------
    // Normalize and dering.
    pix.r = aC / aW;
#if (FSR_EASU_DERING == 1)
    // Keep the result inside the range of the four nearest taps.
    float min1 = min(AMin3F1(fL, gL, jL), kL);
    float max1 = max(AMax3F1(fL, gL, jL), kL);
    pix.r = clamp(pix.r, min1, max1);
#endif
    pix.r = clamp(pix.r, 0.0, 1.0);

    return pix;
}
//!HOOK LUMA
//!BIND EASUTEX
//!DESC FidelityFX Super Resolution v1.0.2 (RCAS)
//!WIDTH EASUTEX.w
//!HEIGHT EASUTEX.h
//!COMPONENTS 1
// User variables - RCAS
#define SHARPNESS 0.2 // Controls the amount of sharpening. The scale is {0.0 := maximum, to N>0, where N is the number of stops (halvings) by which sharpness is reduced}. Clamped to 0.0 to 2.0 by the shader.
#define FSR_RCAS_DENOISE 1 // If set to 1, lessens the sharpening on noisy areas. Can be disabled for better performance. 0 or 1.
#define FSR_PQ 0 // Whether the source content has PQ gamma or not. Needs to be set to the same value for both passes. 0 or 1.
// Shader code
#define FSR_RCAS_LIMIT (0.25 - (1.0 / 16.0)) // Upper bound on the sharpening lobe magnitude; set at the limit beyond which sharpening gives unnatural results.
// Medium-precision reciprocal estimate of 'a': a bit-pattern initial guess followed by one
// Newton-Raphson refinement step, guess * (2 - a * guess).
float APrxMedRcpF1(float a) {
    float guess = uintBitsToFloat(uint(0x7ef19fff) - floatBitsToUint(a));
    float correction = -guess * a + 2.0;
    return guess * correction;
}
// Returns the largest of three scalars.
float AMax3F1(float x, float y, float z) {
    float largerOfYZ = max(y, z);
    return max(x, largerOfYZ);
}
// Returns the smallest of three scalars.
float AMin3F1(float x, float y, float z) {
    float smallerOfYZ = min(y, z);
    return min(x, smallerOfYZ);
}
#if (FSR_PQ == 1)
// Takes the 4th root of a sample via two successive square roots (only compiled when
// FSR_PQ is 1).
float FromGamma2(float a) {
    float squareRoot = sqrt(a);
    return sqrt(squareRoot);
}
#endif
// RCAS pass entry point: contrast-adaptive sharpening of the EASU output.
// Reads the centre texel and its four direct neighbours, derives a negative sharpening
// lobe limited so the result stays within range, optionally weakens it on noisy areas,
// and resolves the sharpened value. Returns it in the red channel; alpha is fixed at 1.0.
vec4 hook() {
    // Algorithm uses minimal 3x3 pixel neighborhood.
    //    b
    //  d e f
    //    h
    // GL_ES is tested with defined(): it is not a defined macro on desktop GLSL, and using
    // an undefined macro name directly in a #if expression is an error per the GLSL spec.
#if (defined(EASUTEX_gather) && (__VERSION__ >= 400 || (defined(GL_ES) && __VERSION__ >= 310)))
    // Two gathers cover all five taps: the upper-left quad gives b, d, e and the
    // lower-right quad gives f, h.
    vec3 bde = EASUTEX_gather(EASUTEX_pos + EASUTEX_pt * vec2(-0.5), 0).xyz;
    float b = bde.z;
    float d = bde.x;
    float e = bde.y;

    vec2 fh = EASUTEX_gather(EASUTEX_pos + EASUTEX_pt * vec2(0.5), 0).zx;
    float f = fh.x;
    float h = fh.y;
#else
    float b = EASUTEX_texOff(vec2( 0.0, -1.0)).r;
    float d = EASUTEX_texOff(vec2(-1.0,  0.0)).r;
    float e = EASUTEX_tex(EASUTEX_pos).r;
    float f = EASUTEX_texOff(vec2( 1.0,  0.0)).r;
    float h = EASUTEX_texOff(vec2( 0.0,  1.0)).r;
#endif

    // Min and max of ring.
    float mn1L = min(AMin3F1(b, d, f), h);
    float mx1L = max(AMax3F1(b, d, f), h);

    // Immediate constants for peak range.
    vec2 peakC = vec2(1.0, -1.0 * 4.0);

    // Limiters, these need to be high precision RCPs.
    // NOTE(review): both divisors can be zero (ring maximum 0.0, or ring minimum 1.0);
    // left as in the reference implementation — confirm the target GPUs handle this.
    float hitMinL = min(mn1L, e) / (4.0 * mx1L);
    float hitMaxL = (peakC.x - max(mx1L, e)) / (4.0 * mn1L + peakC.y);
    float lobeL = max(-hitMinL, hitMaxL);
    // Clamp the lobe to [-FSR_RCAS_LIMIT, 0] and attenuate by 2^-SHARPNESS.
    float lobe = max(float(-FSR_RCAS_LIMIT), min(lobeL, 0.0)) * exp2(-clamp(float(SHARPNESS), 0.0, 2.0));

    // Apply noise removal.
#if (FSR_RCAS_DENOISE == 1)
    // Noise detection: deviation of the centre from the ring average, relative to the
    // local min/max range; scales the lobe by a factor between 1.0 and 0.5.
    float nz = 0.25 * b + 0.25 * d + 0.25 * f + 0.25 * h - e;
    nz = clamp(abs(nz) * APrxMedRcpF1(AMax3F1(AMax3F1(b, d, e), f, h) - AMin3F1(AMin3F1(b, d, e), f, h)), 0.0, 1.0);
    nz = -0.5 * nz + 1.0;
    lobe *= nz;
#endif

    // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
    float rcpL = APrxMedRcpF1(4.0 * lobe + 1.0);
    vec4 pix = vec4(0.0, 0.0, 0.0, 1.0);
    pix.r = float((lobe * b + lobe * d + lobe * h + lobe * f + e) * rcpL);
#if (FSR_PQ == 1)
    pix.r = FromGamma2(pix.r);
#endif

    return pix;
}
Why would they make the FSR2 requirements so high that you can't even upscale a video? 😒
Requirements are not high, the underlying technologies are different. High-fidelity upscaling algorithms such as FSR2+ and DLSS are developed for use in video games, the fact that FSR1 and NIS can be used for video playback in the first place is simply a byproduct of their design or a hack.
Video playback does not include motion buffers because each individual scene is prebaked into the video stream. Video games have it because game engines create each scene via real-time rendering. Correspondingly, FSR2+ and DLSS won't be able to upscale pre-rendered in-game video content (e.g., cutscenes, TV screens, etc.) either.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Ah, I see now. I thought it was about FSR3, but FSR2 was also mentioned as impossible. Donkers.
Why would they make the FSR2 requirements so high that you can't even upscale a video? 😒