faqts : Computers : Programming : Languages : Python : Snippets

+ Search
Add Entry AlertManage Folder Edit Entry Add page to http://del.icio.us/
Did You Find This Entry Useful?

1 of 1 people (100%) answered Yes

Entry

ASCII delimited files

Jul 5th, 2000 10:02
Nathan Wallace, Hans Nowak, Snippet 281, D'Arcy J.M. Cain


"""
Packages: text.delimited_files
"""
"""
D'Arcy J.M. Cain <darcy@vex.net> wrote:
> Skip it.  There's a bug.  If a line ends with a quoted entry it gets
> attached to the next one.  I'm tracking it down now and will post the
> corrected one.
OK, this one does the right thing.
"""
#! /usr/bin/env python
# delimited.py
# Modified from the original by D'Arcy J.M. Cain <darcy@druid.net>
#
# CT990226 V0.1
"""
Break tab-delimited or CSV text into a list of tuples.
Original version: SMC databook plugin, readxls.py
Generalized to be usable for any delimiter but \n.
"""
import string, re
def split_delimited(s, delimiter=None) :
	"""Split delimited text (tab-separated or CSV) into a list of tuples.

	Each line of the input becomes one tuple of field strings.

	Parameters:
	s -- the whole text to split.
	delimiter -- single-character field delimiter.  When omitted or
	empty, tab or comma is chosen, whichever occurs more often in the
	first 10000 characters of s; a tie goes to the comma.

	Quoting rules:
	- a delimiter inside double quotes does not split the field;
	- a line break inside double quotes is replaced by the delimiter
	  instead of starting a new record;
	- a doubled double-quote inside a quoted field yields one quote
	  character;
	- spaces directly around a field boundary are dropped.

	The line terminator is detected with findlinedelimiter().  NUL is
	used internally as the field-boundary placeholder, so a NUL that is
	already present in s is treated as a field boundary too.

	Returns a list with one tuple of strings per line.
	"""
	# Line terminator and default delimiter are both guessed from at
	# most the first 10000 characters.
	probe = s[:10000]
	eol = findlinedelimiter(probe)
	if not delimiter:
		# Tuples compare count-first; on equal counts ',' sorts after
		# '\t' and therefore wins.
		delimiter = max([
			(probe.count('\t'), '\t'),
			(probe.count(','), ','),
		])[1]
	# Mark every delimiter that lies outside double quotes with SEP.
	# SEP has to be the very character the splitting pattern below
	# searches for, so the pattern is derived from it.
	SEP = '\0'
	chars = list(s)
	inquote = False
	for i, ch in enumerate(chars):
		if ch == '"':
			inquote = not inquote
		if not inquote and ch == delimiter:
			chars[i] = SEP
	# Splitting on the quote character leaves quoted text at the odd
	# indices and unquoted text at the even ones.
	parts = ''.join(chars).split('"')
	last = len(parts) - 1
	for i, part in enumerate(parts):
		if i % 2:
			# quoted text: a line break here must not end the record
			parts[i] = part.replace(eol, delimiter)
		elif not part and 0 < i < last:
			# an empty gap between two quoted pieces is a doubled quote
			parts[i] = '"'
	# merge it back
	txt = ''.join(parts)
	# One record per line; fields are cut at SEP, discarding the spaces
	# that directly surround it.
	field_break = re.compile(' *' + re.escape(SEP) + ' *')
	return [tuple(field_break.split(line)) for line in txt.split(eol)]
# utilities
def findlinedelimiter(txt) :
	"""Guess the line terminator used in a sample of text.

	Parameters:
	txt -- some text to inspect (a few kB is enough).

	Trailing carriage returns and line feeds are ignored, then txt is
	split on each candidate; the candidate producing the most pieces is
	returned.  On equal counts the candidates rank "\\r\\n", then "\\r",
	then "\\n", so text without any line break (including the empty
	string) yields "\\r\\n".

	Returns one of "\\r\\n" (DOS), "\\n" (Unix) or "\\r" (Mac).
	"""
	mac = "\x0D"
	unix = "\x0A"
	dos = mac + unix
	# A terminator at the very end would add an empty last piece, so
	# drop any run of CR/LF characters there before counting.
	txt = txt.rstrip(dos)
	# (piece count, delimiter) pairs compare count-first; the delimiter
	# string itself decides ties.
	scored = [(len(txt.split(delim)), delim) for delim in (dos, unix, mac)]
	return max(scored)[1]
if __name__=="__main__":
	# Demo: print one tuple per record of a sample that contains quoted
	# fields, a doubled quote, a quoted comma, a quoted line break and
	# a quoted field at the end of a line.
	sample = '''1, 2,3, "vier", "quo""te", "embedded,
comma", "this
is with a newline", here
another record
And another, "should end here"
"should not be attached to previous"'''
	for row in split_delimited(sample):
		# a single parenthesised argument prints the same on Python 2 and 3
		print(row)
# eof