// LoadCSV.java

package edu.hawaii.ics.yucheng;

import java.io.StringReader;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;

/**
 * Loads a list of CSV entries into the corresponding tables of the
 * distributed database system. The {@link #run()} method partitions the parsed
 * entries among the configured nodes, inserts each node's entries on its own
 * thread, and updates the partition information in the catalog DTABLES table.
 * 
 * @author Cheng Jade
 * @assignment ICS 421 Project
 * @date Mar 22, 2010
 * @bugs None
 */
public class LoadCSV implements Runnable {
    /** The load-CSV configuration object. */
    public final LoadCSVConfiguration configuration;

    /** The catalog node extracted from the overall configuration. */
    public final ConfigurationNode catalog;

    /** The parsed CSV entries; each element holds the fields of one entry. */
    private final ArrayList<String[]> csvs = new ArrayList<String[]>();

    /**
     * Set to true as soon as any node thread fails. It only ever changes from
     * false to true, so a failure recorded by one thread cannot be overwritten
     * by a later success on another thread. Declared volatile because it is
     * written by the node threads and read by the thread executing run().
     */
    private volatile boolean failed = false;

    /**
     * Initializes a new instance of this object by parsing the raw CSV lines
     * into entries.
     * 
     * @param configuration
     *            The load-CSV configuration.
     * 
     * @param rawCSV
     *            The raw lines of the CSV input.
     * 
     * @throws ProgramException
     *             Declared for errors raised while parsing the CSV text.
     * @throws NullPointerException
     *             Thrown if either argument is null.
     */
    public LoadCSV(final LoadCSVConfiguration configuration, final ArrayList<String> rawCSV) throws ProgramException {
        if (null == configuration)
            throw new NullPointerException("configuration");
        if (null == rawCSV)
            throw new NullPointerException("rawCSV");

        // initialize configuration and catalog.
        this.configuration = configuration;
        this.catalog = this.configuration.catalog;

        // stitch rawCSV lines into one big string so the parser sees the
        // input as a single stream.
        final StringBuilder builder = new StringBuilder();
        for (final String item : rawCSV)
            builder.append(item).append('\n');
        final StringReader reader = new StringReader(builder.toString());

        // parse the string and obtain all CSV entries and fields of each entry.
        String[] fields;
        while (null != (fields = CSVParser.parse(reader)))
            this.csvs.add(fields);
    }

    /**
     * The main routine that inserts rows into the corresponding nodes and
     * updates the catalog dtables. Prints a summary message when all threads
     * have finished; on a ProgramException the message is printed to standard
     * error and the process exits with status 1.
     */
    public void run() {
        try {
            // Without at least one node there is nowhere to read the column
            // names from and nowhere to load the entries to.
            if (this.configuration.nodes.size() == 0)
                throw new ProgramException("No nodes are configured.");

            // For each node, construct a list of CSVs that will be partitioned
            // to this node.
            final ArrayList<CSVsNodePair> csvsToNodes = new ArrayList<CSVsNodePair>();
            for (int i = 0; i < this.configuration.nodes.size(); i++)
                csvsToNodes.add(new CSVsNodePair(this.configuration.nodes.get(i).node));

            // Examine every CSV and decide which list this CSV belongs to.
            // The node ID is the number following the first four characters
            // of the node name.
            final ArrayList<String> columnNames = this.columnNames();
            for (final String[] item : this.csvs) {
                final int toID = this.partitionTo(columnNames, item);
                boolean assigned = false;
                for (final CSVsNodePair pair : csvsToNodes) {
                    if (Integer.parseInt(pair.node.name.substring(4)) == toID) {
                        pair.nodeCSVs.add(item);
                        assigned = true;
                    }
                }
                // Never drop an entry silently: an ID that matches no node
                // means the entry would otherwise be lost.
                if (!assigned)
                    throw new ProgramException("No node found for partition ID " + toID);
            }

            // Create a bulk insert statement for each node and start a thread
            // on each node to execute this insert statement. Threads also
            // update the catalog dtables. A node that received no entries gets
            // no insert statement, because an INSERT without any value tuples
            // is not valid SQL.
            final int count = csvsToNodes.size();
            final Thread[] threads = new Thread[count];
            for (int i = 0; i < count; i++) {
                final CSVsNodePair pair = csvsToNodes.get(i);
                final String bulkInsertStatement = pair.nodeCSVs.isEmpty() ? null : this.bulkInsertStatement(pair);
                threads[i] = new Thread(new Runner(pair.node, bulkInsertStatement));
                threads[i].start();
            }

            // wait for threads.
            for (int i = 0; i < count; i++)
                DistributedDB.join(threads[i], csvsToNodes.get(i).node);

            // Print message indicating if all commands completed successfully.
            if (!this.failed)
                System.out.println("All CSV were loaded successfully.");
            else
                System.out.println("Not all CSV were loaded successfully.");

        } catch (final ProgramException e) {
            System.err.println(e.getMessage());
            System.exit(1);
            return;
        }
    }

    /**
     * Connects to the first configured node and retrieves the column names of
     * the target table.
     * 
     * @return The list of column names in the target table.
     * 
     * @throws ProgramException
     *             Thrown if the column names cannot be retrieved.
     */
    private ArrayList<String> columnNames() throws ProgramException {

        final ConfigurationNode node = this.configuration.nodes.get(0).node;
        final ArrayList<String> columnNames = new ArrayList<String>();
        try {
            node.runStatement(new StatementRunner() {
                public void run(final Statement statement) throws ProgramException, SQLException {
                    final ResultSet set = statement.executeQuery("SELECT * FROM " + LoadCSV.this.configuration.tableName);
                    try {
                        // JDBC column indices are 1-based.
                        final ResultSetMetaData meta = set.getMetaData();
                        final int columnCount = meta.getColumnCount();
                        for (int i = 1; i <= columnCount; i++)
                            columnNames.add(meta.getColumnName(i));
                    } finally {
                        // only the metadata is needed; release the cursor.
                        set.close();
                    }
                }
            });
            node.log(System.out, "Successfully connected and retrieved column names from a node db.");
            return columnNames;
        } catch (final Exception e) {
            node.log(System.err, e.getMessage());
            throw new ProgramException("columnNames " + e);
        }
    }

    /**
     * Calculates the node ID that a specified CSV entry will be partitioned to.
     * For range partitioning the ID is the 1-based position of the first
     * configured node whose (param1, param2] interval contains the partition
     * value. For hash partitioning the ID is the non-negative remainder of the
     * partition value divided by param1 of the first configured node, plus 1.
     * 
     * @param columnNames
     *            The list of columns names retrieved from target table.
     * 
     * @param csv
     *            The specified CSV entry to be calculated.
     * 
     * @return The node ID that this CSV entry will be partitioned to.
     * 
     * @throws ProgramException
     *             Thrown if there are any errors processing the CSV entry.
     * @throws NullPointerException
     *             Thrown if an argument is null.
     */
    private int partitionTo(final ArrayList<String> columnNames, final String[] csv) throws ProgramException {
        if (null == columnNames)
            throw new NullPointerException("columnNames");
        if (null == csv)
            throw new NullPointerException("csv");

        // locate the partition column among the table columns.
        int indexOfPartitionColumn = -1;
        final String partitionColumn = this.configuration.partitionColumn;
        for (int i = 0; i < columnNames.size(); i++) {
            if (columnNames.get(i).equalsIgnoreCase(partitionColumn)) {
                indexOfPartitionColumn = i;
                break;
            }
        }
        if (indexOfPartitionColumn == -1)
            throw new ProgramException("Mismatch partition table name and the dtables");

        // an entry with too few fields has no value for the partition column.
        if (indexOfPartitionColumn >= csv.length)
            throw new ProgramException("CSV entry has " + csv.length + " fields but the partition column is field " + (indexOfPartitionColumn + 1));

        final String partitionValue = csv[indexOfPartitionColumn];
        if (null == partitionValue)
            throw new ProgramException("Support only numeric partion values");

        try {
            // Parse through double rather than float: a float cannot represent
            // every int exactly, which would shift large values onto the wrong
            // partition.
            final int intPartitionValue = (int) Double.parseDouble(partitionValue);
            if (this.configuration.partitionMethod.equalsIgnoreCase("range")) {
                for (int i = 0; i < this.configuration.nodes.size(); i++) {
                    final int param1 = (int) Double.parseDouble(this.configuration.nodes.get(i).param1);
                    final int param2 = (int) Double.parseDouble(this.configuration.nodes.get(i).param2);
                    if (intPartitionValue > param1 && intPartitionValue <= param2)
                        return i + 1;
                }
                throw new ProgramException("Partition value out of range");
            }
            if (this.configuration.partitionMethod.equalsIgnoreCase("hash")) {
                final int param1 = Integer.parseInt(this.configuration.nodes.get(0).param1);
                if (param1 <= 0)
                    throw new ProgramException("Hash partition parameter must be a positive integer");
                // Java's % keeps the sign of the dividend; shift negative
                // remainders so the resulting ID is always in 1..param1.
                int remainder = intPartitionValue % param1;
                if (remainder < 0)
                    remainder += param1;
                return remainder + 1;
            }
            throw new ProgramException("Support only range and hash partition methods.");
        } catch (final NumberFormatException e) {
            throw new ProgramException("Support only numeric partion values");
        }
    }

    /**
     * A private class that groups a node and its list of CSVs together.
     */
    private static class CSVsNodePair {
        /** The node the entries will be inserted into. */
        public final ConfigurationNode node;

        /** The CSV entries partitioned to the node. */
        public final ArrayList<String[]> nodeCSVs = new ArrayList<String[]>();

        public CSVsNodePair(final ConfigurationNode node) {
            this.node = node;
        }
    }

    /**
     * Create and return a bulk insert statement containing one value tuple per
     * CSV entry of the pair. The pair is expected to hold at least one entry;
     * with no entries the result has no value tuples.
     * 
     * @param pair
     *            A CSVsNodepair used to generate insert statement.
     * 
     * @return The bulk insert statement created.
     */
    private String bulkInsertStatement(final CSVsNodePair pair) {
        final StringBuilder builder = new StringBuilder();
        builder.append("INSERT INTO " + this.configuration.tableName + " VALUES \n");
        final int nodeNum = pair.nodeCSVs.size();
        for (int i = 0; i < nodeNum; i++) {
            final String[] csv = pair.nodeCSVs.get(i);
            builder.append("\t\t\t\t(");
            for (int j = 0; j < csv.length - 1; j++)
                builder.append(DistributedDB.quote(csv[j])).append(", ");
            builder.append(DistributedDB.quote(csv[csv.length - 1])).append(")");
            // separate tuples, but leave no trailing comma after the last one.
            if (i != nodeNum - 1)
                builder.append(", \n");
        }
        return builder.toString();
    }

    /**
     * Create and return an update statement based on the partition information.
     * This SQL statement will be used to modify dtables.
     * 
     * @param partmtd
     *            An dtable entry, 1 for range, 2 for hash
     * 
     * @param partparam1
     *            An dtable entry, partition parameter 1
     * 
     * @param partparam2
     *            An dtable entry, partition parameter 2; null is written as an
     *            empty string
     * 
     * @param node
     *            A ConfigurationNode to figure out the node ID entry of
     *            dtables, or null to update the rows of the table for every
     *            node
     * 
     * @return The update statement
     */
    private String updateStatement(final String partmtd, final String partparam1, String partparam2, final ConfigurationNode node) {
        assert null != partmtd;
        assert null != partparam1;

        final String secondParameter = partparam2 == null ? "" : partparam2;

        final StringBuilder builder = new StringBuilder();
        builder.append("UPDATE DTABLES");
        builder.append(" SET PARTMTD = ");
        builder.append(DistributedDB.quote(partmtd));
        builder.append(", PARTCOL = ");
        builder.append(DistributedDB.quote(this.configuration.partitionColumn));
        builder.append(", PARTPARAM1 = ");
        builder.append(DistributedDB.quote(partparam1));
        builder.append(", PARTPARAM2 = ");
        builder.append(DistributedDB.quote(secondParameter));
        // match the table name as configured or in upper case.
        builder.append(" WHERE (TNAME = ");
        builder.append(DistributedDB.quote(this.configuration.tableName));
        builder.append(" OR TNAME = UCASE(");
        builder.append(DistributedDB.quote(this.configuration.tableName)).append("))");
        if (node != null) {
            // restrict the update to the row of this node; the node ID is the
            // part of the node name after its first four characters.
            builder.append(" AND NODEID = ");
            builder.append(DistributedDB.quote(node.name.substring(4)));
        }
        return builder.toString();
    }

    /**
     * A private runner object that loads the entries of a single node and then
     * updates the catalog.
     */
    private class Runner implements Runnable {

        /** The configuration node associated with a single thread. */
        private final ConfigurationNode node;

        /**
         * The SQL statement to be executed, or null if no entries were
         * partitioned to the node.
         */
        private final String bulkInsertStatement;

        /**
         * Initializes a new instance of the LoadCSV Runner.
         * 
         * @param node
         *            The cluster node associated with this instance.
         * 
         * @param bulkInsertStatement
         *            The insert statement to execute, or null to skip the
         *            insertion because the node has no entries.
         */
        public Runner(final ConfigurationNode node, final String bulkInsertStatement) {
            assert null != node;

            this.node = node;
            this.bulkInsertStatement = bulkInsertStatement;
        }

        /**
         * Executes the insert command for the node associated with this
         * instance, and updates the catalog dtables when insertion finishes.
         * Any failure is recorded in the enclosing LoadCSV instance.
         */
        public void run() {
            try {
                if (null != this.bulkInsertStatement) {
                    // connect to the node and execute the bulk insert statement.
                    this.node.runStatement(new StatementRunner() {
                        public void run(final Statement statement) throws ProgramException, SQLException {
                            Runner.this.node.log(System.out, "Executing: " + Runner.this.bulkInsertStatement);
                            statement.execute(Runner.this.bulkInsertStatement);
                            Runner.this.node.log(System.out, "Statement executed successfully.");
                        }
                    });
                } else {
                    this.node.log(System.out, "No CSV entries are partitioned to this node; nothing to insert.");
                }

                // connect to the catalog and update dtables.
                LoadCSV.this.catalog.runStatement(new StatementRunner() {
                    public void run(final Statement statement) throws ProgramException, SQLException {
                        final String method = LoadCSV.this.configuration.partitionMethod;

                        // dispatch if the partition method is hash: a single
                        // update, not restricted to a node, using the
                        // parameters of the first configured node.
                        if (method.equalsIgnoreCase("hash")) {
                            final String updateStatement = LoadCSV.this.updateStatement("2", LoadCSV.this.configuration.nodes.get(0).param1, LoadCSV.this.configuration.nodes.get(0).param2, null);
                            LoadCSV.this.catalog.log(System.out, "Executing a update statement " + updateStatement);
                            statement.execute(updateStatement);
                            LoadCSV.this.catalog.log(System.out, "Updated for '" + Runner.this.node.hostname + "'");
                            return;
                        }

                        // dispatch if the partition method is range: one
                        // update per configured node, each with that node's
                        // own parameters.
                        if (method.equalsIgnoreCase("range")) {
                            for (int i = 0; i < LoadCSV.this.configuration.nodes.size(); i++) {
                                final String updateStatement = LoadCSV.this.updateStatement("1", LoadCSV.this.configuration.nodes.get(i).param1, LoadCSV.this.configuration.nodes.get(i).param2, LoadCSV.this.configuration.nodes.get(i).node);
                                LoadCSV.this.catalog.log(System.out, "Executing a update statement " + updateStatement);
                                statement.execute(updateStatement);
                            }
                            LoadCSV.this.catalog.log(System.out, "Updated for '" + Runner.this.node.hostname + "'");
                            return;
                        }

                        // no other partition method is supported.
                        throw new ProgramException("Support only range and hash two partition methods.");
                    }
                });

            } catch (final ProgramException e) {
                LoadCSV.this.failed = true;
                this.node.log(System.err, e.getMessage());
            } catch (final RuntimeException e) {
                // record the failure so the summary is not reported as a
                // success, then let the exception terminate the thread as
                // before.
                LoadCSV.this.failed = true;
                throw e;
            }
        }
    }

    /**
     * Returns the string representation of the configuration.
     */
    @Override
    public String toString() {
        return this.configuration.toString();
    }

}