pounard/normalize-path.php

## normalize-path.php
<?php

// Build 100 pseudo-random path strings used as input by the timing loops.
// The vocabulary deliberately repeats '..', '/' and '' so that parent
// references and empty segments show up often in the generated paths.
$words = [
    '..', 'test', 'pouet', '/', '..', '',
    'usr', 'bin', 'firefox', 'do not', 'panic', 'secret', 'perso', 'private', 'foo', '/', '..', '',
    'hyper long word', 'word with space', 'some56457', '@ert', '.ezrzer', 'cassoulet', '/', '..', '',
];
$lastWordIndex = count($words) - 1;

$testData = [];
for ($i = 0; $i < 100; ++$i) {
    // The counter starts at 1, so a draw of $count yields max($count - 1, 0) words.
    $count = rand(0, 20);
    $parts = [];
    $j = 1;
    while ($j < $count) {
        $parts[] = $words[rand(0, $lastWordIndex)];
        ++$j;
    }
    $value = implode('/', $parts);

    // One draw out of five prepends a '/', and an independent one out of five appends one.
    if (0 === rand(0, 4)) {
        $value = '/'.$value;
    }
    if (0 === rand(0, 4)) {
        $value .= '/';
    }

    $testData[] = $value;
}
print_r($testData);

// Expectation table for the correctness runs: each key is a raw input path and
// each value is the normalized string the result is compared against with ===.
$test = [
    // Tests with '..'
    'a/b/..' => 'a',
    'https://a/b/../' => 'https://a',
    '/a/b/c/d/../e/f' => '/a/b/c/e/f',
    'a/b/c/../../e/f' => 'a/e/f',
    'ftp://a/../b/../c/../e/f' => 'ftp://e/f',
    // Dots that are only part of a longer name must be left alone
    'a../b/c../d..e/' => 'a../b/c../d..e',
    // A leading '..' has nothing to resolve against and is kept
    '../c/d' => '../c/d',
    // With multiple '/'
    '/a/b/////c/d/../e/f' => '/a/b/c/e/f',
    'file:////a/b/c//../..//e/f' => 'file:///a/e/f',
    '////a/../b/../c//../e/f' => '/e/f',
    'a../b//c../d..e/' => 'a../b/c../d..e',
    '../c////d' => '../c/d',
    // With dots
    'a/b/./././..' => 'a',
    'a/.b/./../' => 'a',
    '/a/b/.c/d/../e/f' => '/a/b/.c/e/f',
    '.a/./b/c/.././../e./f' => '.a/e./f',
    // Special cases
    '/' => '/',
    '.' => '.',
    '..' => '..',
    // NOTE(review): the expected value drops the leading '/'; confirm this is
    // the intended outcome for a path that climbs above the root.
    '/..' => '..', // Invalid
    './' => '.',
    '../' => '..',
    '/.' => '/',
];

/**
 * Normalizes a path lexically, using preg_replace() only.
 *
 * Repeated '/' are collapsed, '.' segments are dropped, every 'name/..' pair
 * is resolved and trailing slashes are stripped. A leading 'scheme://' prefix
 * is split off first and put back untouched. Nothing is checked against the
 * filesystem.
 *
 * - A leading './' is dropped without making the path absolute ('./a' => 'a').
 * - '..' segments that have no name to cancel are kept ('../../a' is unchanged).
 * - An absolute path that resolves to the root yields '/' ('/a/..' => '/').
 *
 * The original author measured the regex approach as roughly twice as fast as
 * the array based normalizePath2(); re-run the benchmark before relying on it.
 *
 * @param string $string Path to normalize.
 *
 * @return string The normalized path; '' when a relative path resolves to
 *                nothing (for instance 'a/..').
 */
function normalizePath($string)
{
    // Handle windows gracefully
    if (\DIRECTORY_SEPARATOR !== '/') {
        $string = \str_replace(\DIRECTORY_SEPARATOR, '/', $string);
    }
    // Without any separator, or for these exact values, there is nothing to do
    if (false === \strpos($string, '/') || '/' === $string || '.' === $string || '..' === $string) {
        return $string;
    }
    // Only an all-slashes input ('//', '///', ...) trims down to nothing here,
    // and that denotes the root
    if ('' === ($string = \rtrim($string, '/'))) {
        return '/';
    }

    $scheme = null;
    // A match at offset 0 is ignored on purpose: no scheme name precedes it
    if (\strpos($string, '://')) {
        list($scheme, $string) = \explode('://', $string, 2);
    }

    // Collapse runs of '/' first so the patterns below only meet single separators
    $string = \preg_replace('@//+@', '/', $string);
    // Drop leading './' without introducing a leading '/'
    $string = \preg_replace('@^(?:\./)+@', '', $string);
    // Drop every remaining '.' segment, including a trailing one
    $string = \preg_replace('@(?:/\.)+(?:/|$)@', '/', $string);

    // Remove 'name/..' pairs until none is left; removing one pair can expose
    // another. (?<![^/]) anchors the match at the start of a segment and
    // (?!\.\./) refuses '..' as the name, so '../..' is never cancelled out.
    $count = 0;
    do {
        $string = \preg_replace('@(?<![^/])(?!\.\./)[^/]+/\.\.(?:/|$)@', '', $string, -1, $count);
    } while ($count);

    // The replacements can leave a trailing '/'; if that slash is all that is
    // left, it is the root and must survive the trim
    $path = \rtrim($string, '/');
    if ('' === $path && '' !== $string) {
        $path = '/';
    }

    // Compare against null: a truthiness test would drop a scheme named '0'
    return (null === $scheme ? '' : $scheme.'://').$path;
}

/**
 * Normalizes a path lexically by walking its segments once.
 *
 * Loosely modelled on Python's os.path.normpath(): empty and '.' segments are
 * skipped, and a '..' segment cancels the previously kept segment. A '..'
 * with nothing to cancel (start of the path, or right after another kept
 * '..') is kept as is. A leading 'scheme://' prefix is split off first and
 * put back untouched, also when the remainder is absolute
 * ('file:///a' => 'file:///a'). Nothing is checked against the filesystem.
 *
 * @param string $string Path to normalize.
 *
 * @return string The normalized path; '' when a relative path resolves to
 *                nothing (for instance 'a/..').
 */
function normalizePath2($string)
{
    // Handle windows gracefully
    if (\DIRECTORY_SEPARATOR !== '/') {
        $string = \str_replace(\DIRECTORY_SEPARATOR, '/', $string);
    }
    // Without any separator, or for these exact values, there is nothing to do
    if (false === \strpos($string, '/') || '/' === $string || '.' === $string || '..' === $string) {
        return $string;
    }
    // Only an all-slashes input ('//', '///', ...) trims down to nothing here,
    // and that denotes the root
    if ('' === ($string = \rtrim($string, '/'))) {
        return '/';
    }

    $scheme = null;
    // A match at offset 0 is ignored on purpose: no scheme name precedes it
    if (\strpos($string, '://')) {
        list($scheme, $string) = \explode('://', $string, 2);
    }

    // $string cannot be empty here: the trim above guarantees at least one
    // character after the scheme separator
    $absolute = '/' === $string[0];

    $kept = [];
    foreach (\explode('/', $string) as $segment) {
        if ('' === $segment || '.' === $segment) { // No-op values
            continue;
        }
        // Back in hierarchy: drop the previous segment, unless there is none
        // or it is itself an unresolved '..'
        if ('..' === $segment && [] !== $kept && '..' !== \end($kept)) {
            \array_pop($kept);
            continue;
        }
        $kept[] = $segment;
    }

    // Scheme and root marker are independent: both are emitted when present.
    // Compare against null, a truthiness test would drop a scheme named '0'.
    return (null === $scheme ? '' : $scheme.'://').($absolute ? '/' : '').\implode('/', $kept);
}

// Correctness run for normalizePath(): one OK/FAIL line per entry of $test.
echo "\n\nnormalizePath()\n";
foreach ($test as $string => $expected) {
    $ret = normalizePath($string);
    echo $ret === $expected
        ? "OK $string -> $ret\n"
        : "FAIL $string -> $ret (expected: $expected)\n";
}

// Same run for normalizePath2().
echo "\n\nnormalizePath2()\n";
foreach ($test as $string => $expected) {
    $ret = normalizePath2($string);
    echo $ret === $expected
        ? "OK $string -> $ret\n"
        : "FAIL $string -> $ret (expected: $expected)\n";
}

// Timing run for normalizePath(): 1000 passes over $testData. The function is
// called directly so the measurement carries no dispatch overhead.
echo "\n\nnormalizePath() performance... ";
$time = microtime(true);
$i = 0;
while ($i < 1000) {
    foreach ($testData as $string) {
        $ret1 = normalizePath($string);
    }
    ++$i;
}
$elapsed = microtime(true) - $time;
echo $elapsed, " sec for 100,000 calls\n";

// Timing run for normalizePath2(), same workload.
echo "\n\nnormalizePath2() performance... ";
$time = microtime(true);
$i = 0;
while ($i < 1000) {
    foreach ($testData as $string) {
        $ret1 = normalizePath2($string);
    }
    ++$i;
}
$elapsed = microtime(true) - $time;
echo $elapsed, " sec for 100,000 calls\n";
	<?php

	// Build 100 pseudo-random path strings used as input by the timing loops.
	// The vocabulary deliberately repeats '..', '/' and '' so that parent
	// references and empty segments show up often in the generated paths.
	$words = [
	    '..', 'test', 'pouet', '/', '..', '',
	    'usr', 'bin', 'firefox', 'do not', 'panic', 'secret', 'perso', 'private', 'foo', '/', '..', '',
	    'hyper long word', 'word with space', 'some56457', '@ert', '.ezrzer', 'cassoulet', '/', '..', '',
	];
	$lastWordIndex = count($words) - 1;

	$testData = [];
	for ($i = 0; $i < 100; ++$i) {
	    // The counter starts at 1, so a draw of $count yields max($count - 1, 0) words.
	    $count = rand(0, 20);
	    $parts = [];
	    $j = 1;
	    while ($j < $count) {
	        $parts[] = $words[rand(0, $lastWordIndex)];
	        ++$j;
	    }
	    $value = implode('/', $parts);

	    // One draw out of five prepends a '/', and an independent one out of five appends one.
	    if (0 === rand(0, 4)) {
	        $value = '/'.$value;
	    }
	    if (0 === rand(0, 4)) {
	        $value .= '/';
	    }

	    $testData[] = $value;
	}
	print_r($testData);

	// Expectation table for the correctness runs: each key is a raw input path and
	// each value is the normalized string the result is compared against with ===.
	$test = [
	// Tests with '..'
	'a/b/..' => 'a',
	'https://a/b/../' => 'https://a',
	'/a/b/c/d/../e/f' => '/a/b/c/e/f',
	'a/b/c/../../e/f' => 'a/e/f',
	'ftp://a/../b/../c/../e/f' => 'ftp://e/f',
	// Dots that are only part of a longer name must be left alone
	'a../b/c../d..e/' => 'a../b/c../d..e',
	// A leading '..' has nothing to resolve against and is kept
	'../c/d' => '../c/d',
	// With multiple '/'
	'/a/b/////c/d/../e/f' => '/a/b/c/e/f',
	'file:////a/b/c//../..//e/f' => 'file:///a/e/f',
	'////a/../b/../c//../e/f' => '/e/f',
	'a../b//c../d..e/' => 'a../b/c../d..e',
	'../c////d' => '../c/d',
	// With dots
	'a/b/./././..' => 'a',
	'a/.b/./../' => 'a',
	'/a/b/.c/d/../e/f' => '/a/b/.c/e/f',
	'.a/./b/c/.././../e./f' => '.a/e./f',
	// Special cases
	'/' => '/',
	'.' => '.',
	'..' => '..',
	// NOTE(review): the expected value drops the leading '/'; confirm this is
	// the intended outcome for a path that climbs above the root.
	'/..' => '..', // Invalid
	'./' => '.',
	'../' => '..',
	'/.' => '/',
	];

	/**
	 * Normalizes a path lexically, using preg_replace() only.
	 *
	 * Repeated '/' are collapsed, '.' segments are dropped, every 'name/..' pair
	 * is resolved and trailing slashes are stripped. A leading 'scheme://' prefix
	 * is split off first and put back untouched. Nothing is checked against the
	 * filesystem.
	 *
	 * - A leading './' is dropped without making the path absolute ('./a' => 'a').
	 * - '..' segments that have no name to cancel are kept ('../../a' is unchanged).
	 * - An absolute path that resolves to the root yields '/' ('/a/..' => '/').
	 *
	 * The original author measured the regex approach as roughly twice as fast as
	 * the array based normalizePath2(); re-run the benchmark before relying on it.
	 *
	 * @param string $string Path to normalize.
	 *
	 * @return string The normalized path; '' when a relative path resolves to
	 *                nothing (for instance 'a/..').
	 */
	function normalizePath($string)
	{
	    // Handle windows gracefully
	    if (\DIRECTORY_SEPARATOR !== '/') {
	        $string = \str_replace(\DIRECTORY_SEPARATOR, '/', $string);
	    }
	    // Without any separator, or for these exact values, there is nothing to do
	    if (false === \strpos($string, '/') || '/' === $string || '.' === $string || '..' === $string) {
	        return $string;
	    }
	    // Only an all-slashes input ('//', '///', ...) trims down to nothing here,
	    // and that denotes the root
	    if ('' === ($string = \rtrim($string, '/'))) {
	        return '/';
	    }

	    $scheme = null;
	    // A match at offset 0 is ignored on purpose: no scheme name precedes it
	    if (\strpos($string, '://')) {
	        list($scheme, $string) = \explode('://', $string, 2);
	    }

	    // Collapse runs of '/' first so the patterns below only meet single separators
	    $string = \preg_replace('@//+@', '/', $string);
	    // Drop leading './' without introducing a leading '/'
	    $string = \preg_replace('@^(?:\./)+@', '', $string);
	    // Drop every remaining '.' segment, including a trailing one
	    $string = \preg_replace('@(?:/\.)+(?:/|$)@', '/', $string);

	    // Remove 'name/..' pairs until none is left; removing one pair can expose
	    // another. (?<![^/]) anchors the match at the start of a segment and
	    // (?!\.\./) refuses '..' as the name, so '../..' is never cancelled out.
	    $count = 0;
	    do {
	        $string = \preg_replace('@(?<![^/])(?!\.\./)[^/]+/\.\.(?:/|$)@', '', $string, -1, $count);
	    } while ($count);

	    // The replacements can leave a trailing '/'; if that slash is all that is
	    // left, it is the root and must survive the trim
	    $path = \rtrim($string, '/');
	    if ('' === $path && '' !== $string) {
	        $path = '/';
	    }

	    // Compare against null: a truthiness test would drop a scheme named '0'
	    return (null === $scheme ? '' : $scheme.'://').$path;
	}

	/**
	 * Normalizes a path lexically by walking its segments once.
	 *
	 * Loosely modelled on Python's os.path.normpath(): empty and '.' segments are
	 * skipped, and a '..' segment cancels the previously kept segment. A '..'
	 * with nothing to cancel (start of the path, or right after another kept
	 * '..') is kept as is. A leading 'scheme://' prefix is split off first and
	 * put back untouched, also when the remainder is absolute
	 * ('file:///a' => 'file:///a'). Nothing is checked against the filesystem.
	 *
	 * @param string $string Path to normalize.
	 *
	 * @return string The normalized path; '' when a relative path resolves to
	 *                nothing (for instance 'a/..').
	 */
	function normalizePath2($string)
	{
	    // Handle windows gracefully
	    if (\DIRECTORY_SEPARATOR !== '/') {
	        $string = \str_replace(\DIRECTORY_SEPARATOR, '/', $string);
	    }
	    // Without any separator, or for these exact values, there is nothing to do
	    if (false === \strpos($string, '/') || '/' === $string || '.' === $string || '..' === $string) {
	        return $string;
	    }
	    // Only an all-slashes input ('//', '///', ...) trims down to nothing here,
	    // and that denotes the root
	    if ('' === ($string = \rtrim($string, '/'))) {
	        return '/';
	    }

	    $scheme = null;
	    // A match at offset 0 is ignored on purpose: no scheme name precedes it
	    if (\strpos($string, '://')) {
	        list($scheme, $string) = \explode('://', $string, 2);
	    }

	    // $string cannot be empty here: the trim above guarantees at least one
	    // character after the scheme separator
	    $absolute = '/' === $string[0];

	    $kept = [];
	    foreach (\explode('/', $string) as $segment) {
	        if ('' === $segment || '.' === $segment) { // No-op values
	            continue;
	        }
	        // Back in hierarchy: drop the previous segment, unless there is none
	        // or it is itself an unresolved '..'
	        if ('..' === $segment && [] !== $kept && '..' !== \end($kept)) {
	            \array_pop($kept);
	            continue;
	        }
	        $kept[] = $segment;
	    }

	    // Scheme and root marker are independent: both are emitted when present.
	    // Compare against null, a truthiness test would drop a scheme named '0'.
	    return (null === $scheme ? '' : $scheme.'://').($absolute ? '/' : '').\implode('/', $kept);
	}

	// Correctness run for normalizePath(): one OK/FAIL line per entry of $test.
	echo "\n\nnormalizePath()\n";
	foreach ($test as $string => $expected) {
	    $ret = normalizePath($string);
	    echo $ret === $expected
	        ? "OK $string -> $ret\n"
	        : "FAIL $string -> $ret (expected: $expected)\n";
	}

	// Same run for normalizePath2().
	echo "\n\nnormalizePath2()\n";
	foreach ($test as $string => $expected) {
	    $ret = normalizePath2($string);
	    echo $ret === $expected
	        ? "OK $string -> $ret\n"
	        : "FAIL $string -> $ret (expected: $expected)\n";
	}

	// Timing run for normalizePath(): 1000 passes over $testData. The function is
	// called directly so the measurement carries no dispatch overhead.
	echo "\n\nnormalizePath() performance... ";
	$time = microtime(true);
	$i = 0;
	while ($i < 1000) {
	    foreach ($testData as $string) {
	        $ret1 = normalizePath($string);
	    }
	    ++$i;
	}
	$elapsed = microtime(true) - $time;
	echo $elapsed, " sec for 100,000 calls\n";

	// Timing run for normalizePath2(), same workload.
	echo "\n\nnormalizePath2() performance... ";
	$time = microtime(true);
	$i = 0;
	while ($i < 1000) {
	    foreach ($testData as $string) {
	        $ret1 = normalizePath2($string);
	    }
	    ++$i;
	}
	$elapsed = microtime(true) - $time;
	echo $elapsed, " sec for 100,000 calls\n";