# Source listing: 660 lines (554 sloc), 23.2 KB, file mode 100644.
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import struct, sys, time, os
import zlib
import builtins
import io
import _compression
14
__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]
16
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode.
READ = 1
WRITE = 2

# Named compression levels.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9

# Number of bytes requested from the underlying file per read call.
READ_BUFFER_SIZE = 128 * 1024
25
26
27
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
    "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError for a mode containing both "t" and "b", or when a
    text-only argument is given in binary mode, and TypeError when filename
    is neither a path nor an object with a read() or write() method.
    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself rejects "t", so strip it before delegating.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" in mode:
        encoding = io.text_encoding(encoding)
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file
71
72
def write32u(output, value):
    """Write *value* to *output* as a little-endian unsigned 32-bit integer.

    *output* is any object with a write() method accepting bytes.
    struct.pack() raises struct.error if *value* does not fit in the
    "<L" format, so callers must pass a value in range(2**32).
    """
    output.write(struct.pack("<L", value))
76
77
class _PaddedFile:
78
"""Minimal read-only file object that prepends a string to the contents
79
of an actual file. Shouldn't be used outside of gzip.py, as it lacks
80
essential functionality."""
81
82
def __init__(self, f, prepend=b''):
83
self._buffer = prepend
84
self._length = len(prepend)
85
self.file = f
86
self._read = 0
87
88
def read(self, size):
89
if self._read is None:
90
return self.file.read(size)
91
if self._read + size <= self._length:
92
read = self._read
93
self._read += size
94
return self._buffer[read:self._read]
95
else:
96
read = self._read
97
self._read = None
98
return self._buffer[read:] + \
99
self.file.read(size-self._length+read)
100
101
def prepend(self, prepend=b''):
102
if self._read is None:
103
self._buffer = prepend
104
else: # Assume data was read since the last prepend() call
105
self._read -= len(prepend)
106
return
107
self._length = len(self._buffer)
108
self._read = 0
109
111
self._read = None
112
self._buffer = None
115
def seekable(self):
116
return True # Allows fast-forwarding even in unseekable streams
118
119
class BadGzipFile(OSError):
    """Exception raised in some cases for invalid gzip files."""
121
122
123
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # Remember the file we opened ourselves so close() can close it.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        # Keep the caller-supplied mode so an implicit write mode can be
        # reported below.
        origmode = mode
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            if origmode is None:
                import warnings
                warnings.warn(
                    "GzipFile was opened for writing, but this will "
                    "change in future Python releases. "
                    "Specify the mode argument for opening it for writing.",
                    FutureWarning, 2)
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits selects a raw deflate stream; the gzip header
            # and trailer are written by this class.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header(compresslevel)

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-stream state used while writing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to self.fileobj."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self,data):
        """Compress *data*, write it out and return the uncompressed length."""
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read from the decompressed stream via the internal buffer."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffer's peek(); read mode only."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and close any file this object opened itself.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first. Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file (write mode)."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Move to *offset* in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes. In read mode the call is passed to the internal
        buffer.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        self._check_not_closed()
        return self._buffer.readline(size)
394
395
396
def _read_exact(fp, n):
397
'''Read exactly *n* bytes from `fp`
398
399
This method is required because fp may be unbuffered,
400
i.e. return short reads.
401
'''
402
data = fp.read(n)
403
while len(data) < n:
404
b = fp.read(n - len(data))
405
if not b:
406
raise EOFError("Compressed file ended before the "
407
"end-of-stream marker was reached")
408
data += b
409
return data
410
411
412
def _read_gzip_header(fp):
    '''Read a gzip header from `fp` and progress to the end of the header.

    Returns last mtime if header was present or None otherwise.

    Raises BadGzipFile if the magic bytes or the compression method are not
    the expected ones.
    '''
    magic = fp.read(2)
    if magic == b'':
        return None

    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    # Method byte, flag byte and 32-bit mtime; the two trailing bytes of the
    # fixed header are skipped by the "xx" padding.
    (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    if flag & FEXTRA:
        # Read & discard the extra field, if present
        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
        _read_exact(fp, extra_len)
    if flag & FNAME:
        # Read and discard a null-terminated string containing the filename
        while True:
            s = fp.read(1)
            if not s or s==b'\000':
                break
    if flag & FCOMMENT:
        # Read and discard a null-terminated string containing a comment
        while True:
            s = fp.read(1)
            if not s or s==b'\000':
                break
    if flag & FHCRC:
        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
    return last_mtime
447
448
449
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decompresses a stream of one or more gzip members."""

    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size for the member about to be read."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_gzip_header(self):
        """Consume the next member header; return False if there is none."""
        last_mtime = _read_gzip_header(self._fp)
        if last_mtime is None:
            return False
        self._last_mtime = last_mtime
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, or b"" at end of stream.

        A negative *size* reads everything via readall(). Raises EOFError
        if the file ends before the current member is complete.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            if self._decompressor.needs_input:
                buf = self._fp.read(READ_BUFFER_SIZE)
                uncompress = self._decompressor.decompress(buf, size)
            else:
                uncompress = self._decompressor.decompress(b"", size)

            if self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._crc = zlib.crc32(uncompress, self._crc)
        self._stream_size += len(uncompress)
        self._pos += len(uncompress)
        return uncompress

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it."""
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        super()._rewind()
        self._new_member = True
546
547
def _create_simple_gzip_header(compresslevel: int,
                               mtime = None) -> bytes:
    """
    Write a simple gzip header with no extra fields.
    :param compresslevel: Compresslevel used to determine the xfl bytes.
    :param mtime: The mtime (must support conversion to a 32-bit integer).
                  When None, the current time is used.
    :return: A bytes object representing the gzip header.
    """
    if mtime is None:
        mtime = time.time()
    if compresslevel == _COMPRESS_LEVEL_BEST:
        xfl = 2
    elif compresslevel == _COMPRESS_LEVEL_FAST:
        xfl = 4
    else:
        xfl = 0
    # Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
    # fields added to header), mtime, xfl and os (255 for unknown OS).
    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)
566
567
568
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed string.

    compresslevel sets the compression level in range of 0-9.
    mtime can be used to set the modification time. The modification time is
    set to the current time by default.
    """
    if mtime == 0:
        # Use zlib as it creates the header with 0 mtime by default.
        # This is faster and with less overhead.
        return zlib.compress(data, level=compresslevel, wbits=31)
    header = _create_simple_gzip_header(compresslevel, mtime)
    # Trailer: CRC-32 of the input followed by its length modulo 2**32.
    trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
    # Wbits=-15 creates a raw deflate block.
    return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
            trailer)
585
586
def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.

    Concatenated members are decompressed one after another and joined.
    Raises EOFError if a member is truncated, and BadGzipFile if a member's
    stored CRC or length does not match the decompressed data.
    """
    decompressed_members = []
    while True:
        fp = io.BytesIO(data)
        if _read_gzip_header(fp) is None:
            return b"".join(decompressed_members)
        # Use a zlib raw deflate compressor
        do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
        # Read all the data except the header
        decompressed = do.decompress(data[fp.tell():])
        # The 8-byte trailer must follow the end of the deflate stream.
        if not do.eof or len(do.unused_data) < 8:
            raise EOFError("Compressed file ended before the end-of-stream "
                           "marker was reached")
        crc, length = struct.unpack("<II", do.unused_data[:8])
        if crc != zlib.crc32(decompressed):
            raise BadGzipFile("CRC check failed")
        if length != (len(decompressed) & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")
        decompressed_members.append(decompressed)
        # Skip the trailer and any zero padding before the next member.
        data = do.unused_data[8:].lstrip(b"\x00")
611
def main():
    """Command-line entry point: compress or decompress each argument.

    "-" stands for standard input/output. Input files are never deleted.
    The level chosen by --fast/--best (or the trade-off level when neither
    is given) is applied to named files as well as to standard output.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        if args.decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    sys.exit(f"filename doesn't end in .gz: {arg!r}")
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer,
                             compresslevel=compresslevel)
            else:
                f = builtins.open(arg, "rb")
                # Pass the selected level; otherwise --fast/--best would be
                # silently ignored for named files.
                g = open(arg + ".gz", "wb", compresslevel=compresslevel)
        while True:
            chunk = f.read(READ_BUFFER_SIZE)
            if not chunk:
                break
            g.write(chunk)
        # The process-wide standard streams are left open.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()
658
659
if __name__ == '__main__':
    # Run the command-line interface when executed as a script.
    main()