Last active
June 26, 2018 11:24
-
-
Save pounard/4c242bf94e36d4e8f8f1ae68021ec45c to your computer and use it in GitHub Desktop.
Path normalization, equivalent of Python's os.path.normpath() and os.path.abspath()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Generate random path-like strings used as input for the performance test.
// The word list deliberately repeats '/', '..' and '' so the generated paths
// contain plenty of repeated separators and parent references.
$testData = [];
$words = ['..', 'test', 'pouet', '/', '..', '', 'usr', 'bin', 'firefox', 'do not', 'panic', 'secret', 'perso', 'private', 'foo', '/', '..', '', 'hyper long word', 'word with space', 'some56457', '@ert', '.ezrzer', 'cassoulet', '/', '..', ''];
for ($i = 0; $i < 100; ++$i) {
    $value = [];
    // The inner loop starts at 1, so each path is built from 0 to 19 words.
    $count = rand(0, 20);
    for ($j = 1; $j < $count; ++$j) {
        $value[] = $words[rand(0, count($words) - 1)];
    }
    $value = implode('/', $value);
    // About one path in five gets a leading slash...
    if (rand(0, 4) < 1) {
        $value = '/'.$value;
    }
    // ...and, independently, about one in five gets a trailing slash.
    if (rand(0, 4) < 1) {
        $value = $value.'/';
    }
    $testData[] = $value;
}
// Dump the generated data so a benchmark run can be inspected afterwards.
print_r($testData);
// Expectation table: input path => expected normalized path.
// Insertion order is the order in which results are reported.
$test = [
    // Parent references ('..'), with and without a scheme prefix
    'a/b/..' => 'a',
    'https://a/b/../' => 'https://a',
    '/a/b/c/d/../e/f' => '/a/b/c/e/f',
    'a/b/c/../../e/f' => 'a/e/f',
    'ftp://a/../b/../c/../e/f' => 'ftp://e/f',
    // Segments that merely contain '..' must be left alone
    'a../b/c../d..e/' => 'a../b/c../d..e',
    // A leading '..' has no parent to cancel and must be kept
    '../c/d' => '../c/d',
    // Repeated '/' separators
    '/a/b/////c/d/../e/f' => '/a/b/c/e/f',
    'file:////a/b/c//../..//e/f' => 'file:///a/e/f',
    '////a/../b/../c//../e/f' => '/e/f',
    'a../b//c../d..e/' => 'a../b/c../d..e',
    '../c////d' => '../c/d',
    // Current-directory segments ('.') and dot-prefixed names
    'a/b/./././..' => 'a',
    'a/.b/./../' => 'a',
    '/a/b/.c/d/../e/f' => '/a/b/.c/e/f',
    '.a/./b/c/.././../e./f' => '.a/e./f',
    // Special cases
    '/' => '/',
    '.' => '.',
    '..' => '..',
    '/..' => '..', // Invalid
    './' => '.',
    '../' => '..',
    '/.' => '/',
];
/**
 * Normalizes a path purely lexically (no filesystem access), using
 * preg_replace().
 *
 * Collapses repeated '/' separators, drops '.' segments, resolves
 * 'name/..' pairs and strips any trailing '/'. An optional 'scheme://'
 * prefix is preserved untouched and only the part after it is normalized.
 *
 * Behaviour worth knowing:
 *  - Strings without any '/', and the literals '/', '.' and '..', are
 *    returned unchanged.
 *  - A '..' segment that has no real parent to cancel (leading '..', or a
 *    '..' following another '..') is kept, so '../../a' stays '../../a'.
 *    This also holds for absolute paths: '/..' is returned as '/..'.
 *  - An absolute path that collapses completely yields '/', a relative one
 *    yields '.', and a scheme-prefixed relative one yields just 'scheme://'.
 *  - '://' at offset 0 is not treated as a scheme separator.
 *
 * @param string $string Path to normalize; on platforms whose
 *                       DIRECTORY_SEPARATOR is not '/', that separator is
 *                       converted to '/' first.
 *
 * @return string The normalized path, always using '/' as separator.
 */
function normalizePath($string)
{
    // Handle windows gracefully
    if (\DIRECTORY_SEPARATOR !== '/') {
        $string = \str_replace(\DIRECTORY_SEPARATOR, '/', $string);
    }

    // Nothing to normalize in these cases
    if (false === \strpos($string, '/') || '/' === $string || '.' === $string || '..' === $string) {
        return $string;
    }

    // Only slashes are left at this point ('//', '///', ...): that is the root
    if ('' === ($string = \rtrim($string, '/'))) {
        return '/';
    }

    // Split off the scheme; the truthiness test deliberately ignores a
    // '://' found at offset 0
    $scheme = null;
    if (\strpos($string, '://')) {
        list($scheme, $string) = \explode('://', $string, 2);
    }

    // Remember this now: the root slash is trimmed away further down
    $absolute = 0 === \strpos($string, '/');

    // Collapse repeated separators so every segment is delimited by one '/'
    $string = \preg_replace('@//+@', '/', $string);

    // Drop '.' segments. The lookbehind anchors on a segment start (string
    // start or just after a '/'), so names such as '.a', 'a.' or '..' are
    // never touched
    $string = \preg_replace('@(?<![^/])\.(?:/|\z)@', '', $string);

    // Resolve 'name/..' pairs. One pass only removes non-overlapping pairs,
    // hence the loop ('a/b/../..' needs two passes). The lookahead refuses
    // to treat a '..' segment as a parent that could be cancelled
    $count = 0;
    do {
        $string = \preg_replace('@(?<![^/])(?!\.\.(?:/|\z))[^/]+/\.\.(?:/|\z)@', '', $string, -1, $count);
    } while ($count);

    // The removals above may leave a trailing '/'
    $string = \rtrim($string, '/');

    if ('' === $string) {
        if ($absolute) {
            $string = '/';
        } elseif (null === $scheme) {
            $string = '.';
        }
    }

    // Compare against null, not truthiness: '0' is a non-empty scheme too
    return (null === $scheme ? '' : $scheme.'://').$string;
}
/**
 * Normalizes a path purely lexically (no filesystem access) by walking its
 * segments and keeping a stack of the ones that survive, which is
 * essentially how Python's os.path.normpath() works.
 *
 * Collapses repeated '/' separators, drops '.' segments, resolves
 * 'name/..' pairs and strips any trailing '/'. An optional 'scheme://'
 * prefix is preserved untouched and only the part after it is normalized.
 *
 * Behaviour worth knowing:
 *  - Strings without any '/', and the literals '/', '.' and '..', are
 *    returned unchanged.
 *  - A '..' segment that has no real parent to cancel (leading '..', or a
 *    '..' following another '..') is kept, so '../../a' stays '../../a'.
 *    This also holds for absolute paths: '/..' is returned as '/..'.
 *  - An absolute path that collapses completely yields '/', a relative one
 *    yields '.', and a scheme-prefixed relative one yields just 'scheme://'.
 *  - '://' at offset 0 is not treated as a scheme separator.
 *
 * @param string $string Path to normalize; on platforms whose
 *                       DIRECTORY_SEPARATOR is not '/', that separator is
 *                       converted to '/' first.
 *
 * @return string The normalized path, always using '/' as separator.
 */
function normalizePath2($string)
{
    // Handle windows gracefully
    if (\DIRECTORY_SEPARATOR !== '/') {
        $string = \str_replace(\DIRECTORY_SEPARATOR, '/', $string);
    }

    // Nothing to normalize in these cases
    if (false === \strpos($string, '/') || '/' === $string || '.' === $string || '..' === $string) {
        return $string;
    }

    // Only slashes are left at this point ('//', '///', ...): that is the root
    if ('' === ($string = \rtrim($string, '/'))) {
        return '/';
    }

    // Split off the scheme; the truthiness test deliberately ignores a
    // '://' found at offset 0
    $scheme = null;
    if (\strpos($string, '://')) {
        list($scheme, $string) = \explode('://', $string, 2);
    }

    $absolute = 0 === \strpos($string, '/');

    // Single pass over the segments, keeping the surviving ones on a stack
    $kept = [];
    $depth = 0;
    foreach (\explode('/', $string) as $segment) {
        // No-op values: empty segments come from repeated or leading '/'
        if ('' === $segment || '.' === $segment) {
            continue;
        }
        // Back in hierarchy: drop the previous segment, unless there is none
        // or it is itself an unresolved '..'
        if ('..' === $segment && $depth > 0 && '..' !== $kept[$depth - 1]) {
            \array_pop($kept);
            --$depth;
            continue;
        }
        $kept[] = $segment;
        ++$depth;
    }

    $path = \implode('/', $kept);
    if ($absolute) {
        $path = '/'.$path;
    } elseif ('' === $path && null === $scheme) {
        $path = '.';
    }

    // The scheme is kept for absolute remainders as well ('file:///a').
    // Compare against null, not truthiness: '0' is a non-empty scheme too
    return (null === $scheme ? '' : $scheme.'://').$path;
}
// Run every entry of the $test table through both implementations and print
// one OK/FAIL line per entry, grouped under a header naming the function.
foreach (['normalizePath', 'normalizePath2'] as $function) {
    echo "\n\n{$function}()\n";
    foreach ($test as $string => $expected) {
        $ret = $function($string);
        if ($ret === $expected) {
            echo "OK $string -> $ret\n";
        } else {
            echo "FAIL $string -> $ret (expected: $expected)\n";
        }
    }
}
// Time both implementations over the generated $testData. Each one is called
// through the same variable-function indirection, so the dispatch overhead is
// identical for both and the timings stay comparable with each other.
$iterations = 1000;
// Derived from the real loop sizes so the label cannot drift from the work
// actually done (1000 x 100 paths prints "100,000").
$calls = number_format($iterations * count($testData));
foreach (['normalizePath', 'normalizePath2'] as $function) {
    echo "\n\n{$function}() performance... ";
    $time = microtime(true);
    for ($i = 0; $i < $iterations; ++$i) {
        foreach ($testData as $string) {
            $function($string);
        }
    }
    echo (microtime(true) - $time), " sec for {$calls} calls\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment