package edu.hawaii.ics.yucheng; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; /** * A static class that provides a method to parse records from a CSV file. * * @author Cheng Jade * @assignment ICS 421 Project * @date Feb 29, 2010 * @bugs None */ public final class CSVParser { /** * A main method to test the implementation. * * @param args * The command line arguments. */ public static void main(final String[] args) { assert null != args; // The the usage. if (args.length != 1) { System.out.println("Usage: CSVParser <path>"); System.out.println(" <path> the path to the csv file"); System.exit(0); return; } try { // Open the file for reading. assert null != args[0]; final Reader reader = new FileReader(args[0]); // Parse lines until the end of the file is reached. String[] row; while (null != (row = CSVParser.parse(reader))) { // Print each line parsed. System.out.print("{ "); for (final String field : row) System.out.print("[" + field + "] "); System.out.println("}"); } } catch (final Exception e) { System.err.println(e); } } /** Possible states of the CSV parser. */ private enum State { /** * Indicates the parser is looking for the first character in a field. * If the parser is in this state, it will add a field to the record * unless an error occurs. */ START, /** * Indicates the parser is parsing an unquoted string. */ CONTENT, /** * Indicates the parser is parsing a quoted string. */ IN_QUOTE, /** * Indicates the parser just read a double-quote in a quoted string. * This may or may not indicate the end of the string. If another * double-quote is read, then it is treated a double-quote is added to * the field, and the parser returns to the IN_QUOTE state. */ END_QUOTE, /** * Indicates the parser has finished parsing a quoted string and is * looking for a comma, end of line, or end of file. */ NEED_COMMA, /** * Indicates the parser has found the end of one record. */ TERMINAL } /** * Parses some input as CSV data. If successful, the method returns a * sequence of strings corresponding to the fields in the line. Otherwise, * the method returns null to indicate there is no more data available from * the reader. * * @param reader * The reader that contains the CSV data. * * @return A sequence of strings corresponding to the fields in the line. * * @throws ProgramException * Thrown if there are any errors processing the CSV file. * @throws NullPointerException * Thrown if the argument is null. */ public static String[] parse(final Reader reader) throws ProgramException { if (null == reader) throw new NullPointerException("reader"); State state = State.START; final ArrayList<String> fields = new ArrayList<String>(); final StringBuilder fieldBuilder = new StringBuilder(); // Loop until the parser enters the terminal state. while (state != State.TERMINAL) { // Read a character, and check for errors. final int ch; try { ch = reader.read(); } catch (final IOException e) { throw new ProgramException(e); } // Ignore carriage-returns. if (ch == '\r') continue; switch (state) { // ------------------------------------------------------------- case START: if (ch == -1 && fields.size() == 0) return null; if (ch == '\n' || ch == -1) { addUnquoted(fields, fieldBuilder); state = State.TERMINAL; } else if (ch == ',') addUnquoted(fields, fieldBuilder); else if (ch == '"') state = State.IN_QUOTE; else if (!Character.isWhitespace(ch)) { fieldBuilder.append((char) ch); state = State.CONTENT; } break; // ------------------------------------------------------------- case CONTENT: if (ch == '"') throw new ProgramException("Unexpected token: '\"'."); if (ch == '\n' || ch == -1) { addUnquoted(fields, fieldBuilder); state = State.TERMINAL; } else if (ch == ',') { addUnquoted(fields, fieldBuilder); state = State.START; } else fieldBuilder.append((char) ch); break; // ------------------------------------------------------------- case IN_QUOTE: if (ch == -1) throw new ProgramException("Unexpected end of line."); if (ch == '"') state = State.END_QUOTE; else fieldBuilder.append((char) ch); break; // ------------------------------------------------------------- case END_QUOTE: if (ch == -1 || ch == '\n') { addQuoted(fields, fieldBuilder); state = State.TERMINAL; } else if (ch == '"') { fieldBuilder.append((char) ch); state = State.IN_QUOTE; } else if (ch == ',') { addQuoted(fields, fieldBuilder); state = State.START; } else if (Character.isWhitespace(ch)) { addQuoted(fields, fieldBuilder); state = State.NEED_COMMA; } else { final String message = "Unexpected token: '" + (char) ch + "'."; throw new ProgramException(message); } break; // ------------------------------------------------------------- case NEED_COMMA: if (ch == -1 || ch == '\n') state = State.TERMINAL; else if (ch == ',') state = State.START; else if (!Character.isWhitespace(ch)) { final String message = "Unexpected token: '" + (char) ch + "'."; throw new ProgramException(message); } break; } } // Return the list as an array. final String[] array = new String[fields.size()]; fields.toArray(array); return array; } /** * Adds a field to the array of fields. The field is not trimmed of * whitespace, and the string builder is reset to zero length. * * @param fields * The array of fields. * * @param fieldBuilder * The string builder used to build the field. */ private static void addQuoted(final ArrayList<String> fields, final StringBuilder fieldBuilder) { assert null != fields; assert null != fieldBuilder; fields.add(fieldBuilder.toString()); fieldBuilder.setLength(0); } /** * Adds a field to the array of fields. The field is trimmed of whitespace, * and the string builder is reset to zero length. * * @param fields * The array of fields. * * @param fieldBuilder * The string builder used to build the field. */ private static void addUnquoted(final ArrayList<String> fields, final StringBuilder fieldBuilder) { assert null != fields; assert null != fieldBuilder; fields.add(fieldBuilder.toString().trim()); fieldBuilder.setLength(0); } }