# NOTE(review): removed GitHub page chrome ("Skip to content", "Permalink",
# blob statistics and gutter line numbers) captured when this file was
# copied from a web view; it is not part of the module source.
"""
csv.py - read/write/investigate CSV files
"""
import re
import types
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

from io import StringIO
# Public API of the csv module.
__all__ = [
    "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
    "Error", "Dialect", "__doc__", "excel", "excel_tab",
    "field_size_limit", "reader", "writer",
    "register_dialect", "get_dialect", "list_dialects", "Sniffer",
    "unregister_dialect", "__version__", "DictReader", "DictWriter",
    "unix_dialect",
]
class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # The base class carries only the None placeholders above, which
        # the C-level validator rejects; only subclasses can be valid.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate attribute checking to the C-level Dialect constructor,
        # which raises TypeError for missing or ill-typed attributes.
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    # Field layout: comma-separated, double-quote quoting only when needed.
    delimiter = ','
    quotechar = '"'
    quoting = QUOTE_MINIMAL
    # Embedded quotes are escaped by doubling, not by an escapechar.
    doublequote = True
    skipinitialspace = False
    # Excel writes DOS-style line endings.
    lineterminator = '\r\n'
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the excel dialect except for the field separator.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    # Comma-separated, but every field is quoted and lines end in bare LF.
    delimiter = ','
    quotechar = '"'
    quoting = QUOTE_ALL
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
register_dialect("unix", unix_dialect)
class DictReader:
    """Iterate over CSV rows from *f*, yielding one dict per row.

    Keys come from *fieldnames*, or from the first row of the file when
    *fieldnames* is None.  Extra values in a long row are collected in a
    list under *restkey*; missing values in a short row default to
    *restval*.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        # Materialize one-shot iterators so fieldnames can be reused.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily read the header row the first time fieldnames is needed.
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        if self.line_num == 0:
            # Used only for its side effect (consuming the header row).
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: stash the overflow under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing keys with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    __class_getitem__ = classmethod(types.GenericAlias)
class DictWriter:
    """Write dicts to *f* as CSV rows, ordering values by *fieldnames*.

    Keys absent from a row dict are written as *restval*.  Keys not in
    *fieldnames* raise ValueError when *extrasaction* is "raise" (the
    default) and are silently dropped when it is "ignore".
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        # Materialize one-shot iterators so fieldnames can be reused.
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            fieldnames = list(fieldnames)
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        extrasaction = extrasaction.lower()
        if extrasaction not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row built from the fieldnames."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Map a row dict onto the fieldnames order, validating extras.
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(map(self._dict_to_list, rowdicts))

    __class_getitem__ = classmethod(types.GenericAlias)
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        # First try the quote-based heuristic; fall back to character
        # frequency analysis when it cannot find a delimiter.
        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        # findall returns tuples of group values; groupindex maps each
        # named group to its 1-based position in those tuples.
        groupindex = regexp.groupindex
        for m in matches:
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = list(filter(None, data.split('\n')))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunkLength
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = max(items, key=lambda x: x[1])
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in list(columnTypes.keys()):
                thisType = complex
                try:
                    thisType(row[col])
                except (ValueError, OverflowError):
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0