Skip to content

gh-102140: False negative csv header bug fix #102787

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 34 additions & 10 deletions Lib/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,9 +390,8 @@ def has_header(self, sample):
# column is of a single type (say, integers), *except* for the first
# row, then the first row is presumed to be labels. If the type
# can't be determined, it is assumed to be a string in which case
# the length of the string is the determining factor: if all of the
# rows except for the first are the same length, it's a header.
# Finally, a 'vote' is taken at the end for each column, adding or
# the length of the string is the determining factor.
# A 'vote' is taken at the end for each column, adding or
# subtracting from the likelihood of the first row being a header.

rdr = reader(StringIO(sample), self.sniff(sample))
Expand All @@ -401,7 +400,11 @@ def has_header(self, sample):

columns = len(header)
columnTypes = {}
for i in range(columns): columnTypes[i] = None
similiratyWords = {}
compareWords = []
for i in range(columns):
columnTypes[i] = None
similiratyWords[i] = 0

checked = 0
for row in rdr:
Expand All @@ -414,7 +417,7 @@ def has_header(self, sample):
continue # skip rows that have irregular number of columns

for col in list(columnTypes.keys()):
thisType = complex
thisType = complex #class complex
try:
thisType(row[col])
except (ValueError, OverflowError):
Expand All @@ -424,15 +427,36 @@ def has_header(self, sample):
if thisType != columnTypes[col]:
if columnTypes[col] is None: # add new column type
columnTypes[col] = thisType
compareWords.append(re.findall(r"\w+[^\s]", row[col])) #create a list of every words
else:
# type is inconsistent, remove column from
# consideration
del columnTypes[col]
if isinstance(row[col], int)==False: #it's not an integer
columnTypes[col] += thisType
compareWords.append(re.findall(r"\w+[^\s]", row[col]))
for words in compareWords[0]: #match words
if words in compareWords[1]: #if a word has been repeated
similiratyWords[col] += 1
del compareWords[0]
else:
# type is inconsistent, remove column from
# consideration
del columnTypes[col]

# finally, compare results against first row and "vote"
# on whether it's a header
hasHeader = 0
hasHeader = 0
#checking if header label is one single word
for col in header:
if len(re.findall(r"\s", col)) > 0 :
hasHeader -= 1
break
if hasHeader == 0 :
hasHeader += 1
for col, colType in columnTypes.items():
columnTypes[col] = columnTypes[col]//checked
if similiratyWords[col]/checked < 1:
hasHeader -= 1
else:
hasHeader += 1
if isinstance(colType, int): # it's a length
if len(header[col]) != colType:
hasHeader += 1
Expand All @@ -446,4 +470,4 @@ def has_header(self, sample):
else:
hasHeader -= 1

return hasHeader > 0
return hasHeader > 0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug in csv.Sniffer().has_header() where it returned False even though the sample clearly contained a header row, by improving the detection heuristic.