Skip to content

Instantly share code, notes, and snippets.

@pansila
Last active December 14, 2023 08:57
Show Gist options
  • Save pansila/7bec1af0e8fda4dc235a5eee9986fdd5 to your computer and use it in GitHub Desktop.
Save pansila/7bec1af0e8fda4dc235a5eee9986fdd5 to your computer and use it in GitHub Desktop.
Cut a large file. Faster than the system's `cut` because it seeks directly to the target byte position instead of scanning by characters or lines.
#!/usr/bin/env python3
import os, sys
import argparse
try:
    from tqdm import tqdm
except ImportError:
    # tqdm is an optional dependency: without it the copy still works, just
    # without a progress bar. ImportError also covers ModuleNotFoundError.
    def tqdm(it, *args, **kwargs):
        """Stand-in for ``tqdm.tqdm``: return the iterable unchanged.

        Extra positional/keyword arguments (``desc=``, ``total=``, ...) are
        accepted and ignored so call sites work with or without tqdm.
        """
        return it

# Number of bytes transferred per read/write call (64 KiB).
BLOCK_SIZE = 1024 * 64
def convert_percent(arg, total):
    """Resolve a position/length argument to an absolute byte count.

    Args:
        arg: ``None`` (not given), an ``int`` number of bytes, or a ``float``
            fraction of *total* (``0.5`` means 50%).
        total: The size the fraction is relative to.

    Returns:
        An ``int``: 0 for ``None``, ``int(total * arg)`` for a float, and the
        value itself for an integer that is at least 1.
    """
    if arg is None:
        return 0
    if isinstance(arg, float):
        # Floats are always fractions. Checking the type rather than the
        # magnitude keeps 1.0 (i.e. "100%") from being read as one byte.
        return int(total * arg)
    return arg if arg >= 1 else int(total * arg)
def main(args):
    """Copy a byte range of ``args.input`` into one or more output files.

    The attributes read from *args* are ``input``, ``output``, ``start``,
    ``end``, ``length`` and ``count``.  ``start``, ``end`` and ``length`` are
    resolved with ``convert_percent``.  ``end`` wins over ``length``; when
    neither is set the range runs to the end of the file.  The range is split
    into ``count`` consecutive pieces (default 1) of equal size, the last one
    possibly shorter.  Output goes to ``args.output`` (default
    ``<input>.cut``), with a ``.<n>`` suffix appended when ``count > 1``.

    Returns:
        The non-zero result of ``validate_args`` if validation fails, -1 if
        the resolved end lies before the resolved start, otherwise 0.
    """
    ret = validate_args(args)
    if ret:
        return ret

    file_length = os.path.getsize(args.input)

    # Resolve the requested range to absolute offsets inside the file.
    start = min(int(convert_percent(args.start, file_length)), file_length)
    if args.end:
        end = int(convert_percent(args.end, file_length))
    elif args.length:
        end = start + int(convert_percent(args.length, file_length))
    else:
        end = file_length
    end = min(end, file_length)
    if end < start:
        # Only detectable here: start and end may have been given in
        # different units (bytes vs. percent).
        print(f'end={end} is smaller than start={start}')
        return -1

    output = args.output or f'{args.input}.cut'
    count = args.count or 1
    # Ceiling division so that `count` pieces always cover the whole range.
    size = (end - start + count - 1) // count

    # The input is only read, so open it read-only.
    with open(args.input, 'rb') as inf:
        inf.seek(start)
        cur = start
        for cnt in range(1, count + 1):
            output_file = f"{output}.{cnt}" if count > 1 else output
            stop = min(cur + size, end)
            piece_size = stop - cur
            print(f'Saving "{args.input}" to "{output_file}" from position {cur} to {stop} (size={piece_size:,})')
            remaining = piece_size
            with open(output_file, 'wb') as outf:
                for _ in tqdm(range((piece_size + BLOCK_SIZE - 1) // BLOCK_SIZE)):
                    # Never read past the end of this piece.
                    chunk = inf.read(min(BLOCK_SIZE, remaining))
                    if not chunk:
                        break
                    outf.write(chunk)
                    remaining -= len(chunk)
            cur = stop - remaining
            if remaining:
                # The file ended earlier than its reported size; nothing
                # is left for the following pieces.
                break
    return 0
def validate_args(args):
    """Check that the requested end position is not before the start.

    Args:
        args: Object with ``start`` and ``end`` attributes. A falsy ``end``
            (``None`` or 0) means "no end given" and always passes; a
            ``None`` start is treated as 0.

    Returns:
        0 if the arguments are acceptable, -1 (after printing a message)
        if ``end`` is smaller than ``start``.
    """
    # Compare against 0 when no start was given, so None does not raise.
    start = args.start or 0
    if args.end and args.end < start:
        print(f'{args.end=} is smaller than {args.start=}')
        return -1
    return 0
# Recognized size suffixes and the number of bytes each one stands for:
# decimal (K/M/G, powers of 1000) and binary (Ki/Mi/Gi, powers of 1024).
_SIZE_SUFFIXES = (
    ('Gi', 1024 ** 3),
    ('Mi', 1024 ** 2),
    ('Ki', 1024),
    ('G', 1000 ** 3),
    ('M', 1000 ** 2),
    ('K', 1000),
)


def _parse_number(text):
    """Return *text* as an int when it is one, otherwise as a float."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def _parse_size(text):
    """Convert one command-line value to a byte count or a fraction.

    ``"<n>%"`` becomes the float ``n / 100``; a value with a size suffix
    becomes an int number of bytes; anything else must be a plain integer.
    """
    if text.endswith('%'):
        return float(text[:-1]) / 100
    for suffix, multiplier in _SIZE_SUFFIXES:
        if text.endswith(suffix):
            return int(_parse_number(text[:-len(suffix)]) * multiplier)
    return int(text)


def convert_args(args):
    """Convert the numeric option strings on *args* to numbers, in place.

    Every attribute except ``input`` and ``output`` is rewritten: ``None``
    becomes 0 and strings are parsed by ``_parse_size`` (so ``"2Ki"`` becomes
    2048, ``"1.5K"`` becomes 1500 and ``"50%"`` becomes 0.5).  Values that
    are not strings are left untouched.

    Raises:
        ValueError: If a string value is not a valid number.
    """
    # Snapshot the items: attributes are reassigned while looping.
    for key, value in list(vars(args).items()):
        if key in ('input', 'output'):
            continue
        if value is None:
            setattr(args, key, 0)
        elif isinstance(value, str):
            setattr(args, key, _parse_size(value))
if __name__ == '__main__':
    # Command-line entry point: declare the options, normalize their values,
    # then run the cut and use its result as the process exit status.
    cli = argparse.ArgumentParser()
    cli.add_argument('input')
    for short_flag, long_flag in (('-s', '--start'), ('-e', '--end'), ('-l', '--length')):
        cli.add_argument(short_flag, long_flag)
    cli.add_argument(
        '-c',
        '--count',
        help="the count of split files with the same size",
    )
    cli.add_argument('-o', '--output')

    args = cli.parse_args()
    convert_args(args)
    exit_status = main(args)
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment