Entry
ASCII delimited files
Jul 5th, 2000 10:02
Nathan Wallace, Hans Nowak, Snippet 282, Mike Fletcher
"""
Packages: text.delimited_files
"""
'''Parse strings representing comma-separated values into lists of
strings
Strips leading and trailing whitespace
Respects both ' and " as string markers
Can use any single-character separator character for record and field
Does not do value conversions (i.e. doesn't do float/int conversions)
Module is hereby released for any use, but without guarantee of fitness or
correctness.
Use at your own risk.
'''
import string
def _is_closed(value, quote):
    """Return true when `value` ends with a closing `quote` character.

    Trailing whitespace is ignored.  At least two characters are required
    so that a lone opening quote is not mistaken for its own terminator.
    """
    stripped = value.rstrip()
    return len(stripped) >= 2 and stripped.endswith(quote)


def parse(file, recordSeperator='\n', fieldSeperator='\t', stepsize=800):
    """Split delimited text into a list of records (lists of field strings).

    The text is split on `recordSeperator`, then each record is split on
    `fieldSeperator`.  Leading and trailing whitespace is stripped from every
    field.  A field whose first non-blank character is " or ' is treated as
    quoted: fragments (and, if needed, following records) are glued back
    together with the separator that split them until a fragment ends with
    the same quote character, and the enclosing quotes are then removed.

    Limitations kept from the original behaviour:
      * doubled quotes inside a quoted value are not unescaped;
      * trailing whitespace inside the quotes is stripped as well;
      * an empty record (including the one after a final record separator)
        yields [''];
      * no value conversion is performed, every field stays a string.

    Parameters:
      file            -- the complete text to parse (a string, despite the name)
      recordSeperator -- string separating records
      fieldSeperator  -- string separating fields within a record
      stepsize        -- retained for backward compatibility only; records are
                         walked by index, so no chunking is needed and the
                         value is ignored

    Returns a list of records, each a list of strings.

    Raises ValueError when a quoted value is never terminated.
    """
    rows = [line.split(fieldSeperator)
            for line in file.split(recordSeperator)]
    row_count = len(rows)
    result = []
    row_index = 0
    while row_index < row_count:
        fragments = rows[row_index]
        frag_index = 0
        record = []
        while frag_index < len(fragments):
            data = fragments[frag_index].lstrip()
            frag_index += 1
            quote = data[:1]
            if quote in ('"', "'"):
                # The separators split a quoted value apart; glue the pieces
                # back together until the closing quote shows up.
                while not _is_closed(data, quote):
                    if frag_index < len(fragments):
                        # another fragment on this record
                        data = data + fieldSeperator + fragments[frag_index]
                        frag_index += 1
                    elif row_index + 1 < row_count:
                        # the value continues on the next record; the rest of
                        # that record's fragments join the current record
                        row_index += 1
                        fragments = rows[row_index]
                        data = data + recordSeperator + fragments[0]
                        frag_index = 1
                    else:
                        raise ValueError(
                            "Malformed field value %s, "
                            "quoted value never ends" % repr(data))
                # Strip before slicing so whitespace after the closing quote
                # does not leave the quote character in the value.
                data = data.rstrip()[1:-1]
            record.append(data.rstrip())
        result.append(record)
        row_index += 1
    return result
# Sample input for the demo run and for sizeTest (parsed there with
# fieldSeperator=','): it contains quoted fields, a doubled quote, a comma
# inside quotes, and a quoted value that continues onto the next line.
baseTestData = '''1, 2,3, "vier", "quo""te", "embedded, comma", "this
is with a newline", here
another record
And another, "should end here"
"should not be attached to previous"
'''
# Shorter sample that sizeTest selects when its sharedData flag is true;
# it has no embedded-comma field and ends with an escaped newline.
baseSharedTestData = '''1, 2,3, "vier", "quo""te", "this
is with a newline", here
another record\n'''
def sizeTest( size, sharedData=0, stepsize=800 ):
    """Time one parse() run over `size` copies of a sample data set.

    baseSharedTestData is used when `sharedData` is true, baseTestData
    otherwise.  The input size, the repetition count, the number of records
    found and the elapsed wall-clock seconds are printed; the parsed records
    are returned.
    """
    import time
    if sharedData:
        sample = baseSharedTestData
    else:
        sample = baseTestData
    payload = sample * size
    started = time.time()
    records = parse(payload, fieldSeperator=',', stepsize=stepsize)
    elapsed = time.time() - started
    # Single-argument print(...) gives the same output as the print statement.
    print('Time for file of %s bytes\n (%s * testset, %s records found):\n%s' % (
        len(payload), size, len(records), elapsed))
    return records
if __name__ == "__main__":
    # Demo: show every record parsed from the sample data, then time parse()
    # on progressively larger inputs built from the shared sample.
    for record in parse(baseTestData, fieldSeperator=','):
        print(record)
    for repetitions in (5, 50, 500, 5000):
        sizeTest(repetitions, 1)
    print("next test may take 15 seconds or so")
    sizeTest(50000, 1)
    print("next test may take 45 seconds or so")
    sizeTest(150000, 1)
    #print "next test may take 450 seconds or so"
    #sizeTest( 500000, 1)