You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

267 lines
8.2 KiB

import csv
import cffi
# IN-PROGRESS. See the demo at the end of the file
def _make_ffi_from_dialect(dialect_name):
    """Generate and compile a cffi extension specialised for one csv dialect.

    The registered csv dialect *dialect_name* is looked up and its settings
    are substituted, as integer constants, into the C source of a small CSV
    state machine.  The result is compiled as the module
    ``_fastcsv_<dialect_name>``.

    The compiled module exports::

        long parse_line(char *rawline, long inputlength);

    which parses one record in place: the fields are written back into
    ``rawline``, each one terminated by a NUL byte.  It returns the number
    of bytes written minus the final terminator, or -1 on a parse error.
    Note that the same subtraction also yields -1 when nothing at all was
    written (for example for a blank line), so callers have to filter such
    input out themselves.

    Up to ``inputlength + 1`` bytes can be written (every input character
    copied, plus the terminator stored when the end-of-input NUL is
    processed), so ``rawline`` must be at least that large.

    The generated parser works on single bytes; dialect characters are
    passed to it through ``ord()``.

    :param dialect_name: name of a dialect registered with the csv module;
        also used as the suffix of the generated module name.
    :raises csv.Error: if *dialect_name* is not a registered dialect
        (raised by ``csv.get_dialect``).
    """
    dialect = csv.get_dialect(dialect_name)
    ffi = cffi.FFI()
    ffi.cdef("""
        long parse_line(char *rawline, long inputlength);
    """)

    # A dialect may have quotechar=None when quoting is QUOTE_NONE.  Every
    # comparison against the quote character in the C code is guarded by
    # "quoting != QUOTE_NONE", so the placeholder value is never significant.
    if dialect.quotechar is None:
        quotechar = 0
    else:
        quotechar = ord(dialect.quotechar)

    d = {
        'quotechar': quotechar,
        'quoting': int(dialect.quoting),
        'skipinitialspace': int(dialect.skipinitialspace),
        'delimiter': ord(dialect.delimiter),
        'doublequote': int(dialect.doublequote),
        'strict': int(dialect.strict),
    }
    # The escape test is spliced in after "c": either a real comparison, or
    # "&& 0" so that the branch is constant-false when there is no escapechar.
    if dialect.escapechar is not None:
        d['is_escape_char'] = '== %d' % ord(dialect.escapechar)
    else:
        d['is_escape_char'] = '&& 0'

    ffi.set_source('_fastcsv_' + dialect_name, r'''

    typedef enum {
        START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
        IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
        EAT_CRNL
    } ParserState;

    typedef enum {
        QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE
    } QuoteStyle;

    typedef struct {
        ParserState state;          /* current CSV parse state */
        char *field;                /* build current field in here */
        int field_size;             /* size of allocated buffer */
        int field_len;              /* length of current field */
        int numeric_field;          /* treat field as numeric */
    } ReaderObj;

    /* append one character at the output cursor */
    static void
    parse_add_char(ReaderObj *self, char c)
    {
        *self->field++ = c;
    }

    /* terminate the current field with a NUL byte */
    static void
    parse_save_field(ReaderObj *self)
    {
        *self->field++ = 0;
    }

    /* 'c' is taken as unsigned char so that it compares equal to the
       non-negative character codes substituted below even when plain
       char is a signed type and the byte is 0x80 or above */
    static int
    parse_process_char(ReaderObj *self, unsigned char c)
    {
        switch (self->state) {
        case START_RECORD:
            /* start of record */
            if (c == '\0')
                /* empty line - return [] */
                break;
            else if (c == '\n' || c == '\r') {
                self->state = EAT_CRNL;
                break;
            }
            /* normal character - handle as START_FIELD */
            self->state = START_FIELD;
            /* fallthru */
        case START_FIELD:
            /* expecting field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* save empty field - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                /* start quoted field */
                self->state = IN_QUOTED_FIELD;
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == ' ' && %(skipinitialspace)d)
                /* ignore space at start of field */
                ;
            else if (c == %(delimiter)d) {
                /* save empty field */
                parse_save_field(self);
            }
            else {
                /* begin new unquoted field */
                if (%(quoting)d == QUOTE_NONNUMERIC)
                    self->numeric_field = 1;
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            break;

        case ESCAPED_CHAR:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_FIELD;
            break;

        case IN_FIELD:
            /* in unquoted field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case IN_QUOTED_FIELD:
            /* in quoted field */
            if (c == '\0')
                ;
            else if (c %(is_escape_char)s) {
                /* Possible escape character */
                self->state = ESCAPE_IN_QUOTED_FIELD;
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                if (%(doublequote)d) {
                    /* doublequote; " represented by "" */
                    self->state = QUOTE_IN_QUOTED_FIELD;
                }
                else {
                    /* end of quote part of field */
                    self->state = IN_FIELD;
                }
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case ESCAPE_IN_QUOTED_FIELD:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_QUOTED_FIELD;
            break;

        case QUOTE_IN_QUOTED_FIELD:
            /* doublequote - seen a quote in an quoted field */
            if (%(quoting)d != QUOTE_NONE &&
                c == %(quotechar)d) {
                /* save "" as " */
                parse_add_char(self, c);
                self->state = IN_QUOTED_FIELD;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (!%(strict)d) {
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            else {
                /* illegal */
                /*PyErr_Format(error_obj, "'%%c' expected after '%%c'",
                                dialect->delimiter,
                                dialect->quotechar);*/
                return -1;
            }
            break;

        case EAT_CRNL:
            if (c == '\n' || c == '\r')
                ;
            else if (c == '\0')
                self->state = START_RECORD;
            else {
                /*PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");*/
                return -1;
            }
            break;

        }
        return 0;
    }

    /* the output cursor starts at the beginning of the input buffer:
       the record is rewritten in place */
    static void
    parse_reset(ReaderObj *self, char *rawline)
    {
        self->field = rawline;
        self->state = START_RECORD;
        self->numeric_field = 0;
    }

    long parse_line(char *rawline, long inputlength)
    {
        char *p;
        ReaderObj reader;
        parse_reset(&reader, rawline);

        for (p=rawline; inputlength > 0; inputlength--, p++) {
            if (parse_process_char(&reader, *p) < 0)
                return -1;
        }
        /* a final NUL signals the end of the input */
        if (parse_process_char(&reader, 0) < 0)
            return -1;
        /* bytes written, not counting the last terminator */
        return reader.field - rawline - 1;
    }
    ''' % d)

    ffi.compile()
def fastcsv_reader(f, dialect_name):
    """Yield every line of *f* parsed into a list of fields.

    The parsing is done by the compiled module ``_fastcsv_<dialect_name>``;
    if that module cannot be imported it is generated first with
    ``_make_ffi_from_dialect``.

    :param f: iterable of lines as byte strings (the demo at the end of the
        file passes a file opened in ``'rb'`` mode).
    :param dialect_name: name of a dialect registered with the csv module.
    :returns: a generator of lists, one per line.  The fields are whatever
        slicing the cffi buffer returns (byte strings).  A line holding
        nothing but line terminators yields ``[]``.
    :raises csv.Error: if the compiled parser rejects a line.
    """
    try:
        module = __import__('_fastcsv_' + dialect_name)
    except ImportError:
        # Not built yet for this dialect: compile it, then retry the import.
        _make_ffi_from_dialect(dialect_name)
        module = __import__('_fastcsv_' + dialect_name)
    ffi, lib = module.ffi, module.lib
    #
    rawline = None
    linelen = -1
    for line in f:
        # A blank line produces no output from parse_line(), which makes its
        # return value (-1) look exactly like a parse error.  Handle it here.
        if not line.strip(b'\r\n'):
            yield []
            continue
        size = len(line)
        # The buffer is reused from line to line; it has to stay strictly
        # larger than the input because the parser may store a terminator
        # right after the last input byte.
        if linelen <= size:
            linelen = 2 * size
            rawline = ffi.new("char[]", linelen)
        ffi.buffer(rawline, size)[:] = line
        n = lib.parse_line(rawline, size)
        if n < 0:
            # An explicit error instead of an assert, which would disappear
            # under "python -O" and let a negative size reach ffi.buffer().
            raise csv.Error("could not parse line: %r" % (line,))
        yield ffi.buffer(rawline, n)[:].split(b'\x00')
if __name__ == '__main__':
    # Demo: parse /etc/passwd as colon-separated records without quoting.
    csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE)
    # Binary mode: the reader copies the raw bytes of each line into a
    # C buffer.
    with open('/etc/passwd', 'rb') as f:
        reader = fastcsv_reader(f, 'unixpwd')
        for row in reader:
            # Single-argument call form: prints the same under Python 2 and
            # is valid syntax under Python 3.
            print(row)