12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768 |
- /**
- *
- * Copyright (c) behosoft Co.,Ltd.
- * All Rights Reserved.
- *
- * This software is the confidential and proprietary information of behosoft.
- * (Social Security Department). You shall not disclose such
- * Confidential Information and shall use it only in accordance with
- * the terms of the license agreement you entered into with behosoft.
- *
- * Distributable under GNU LGPL license by gnu.org
- */
- package com.behosoft.util;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.Reader;
- import java.io.StringReader;
- import java.nio.charset.Charset;
- import java.text.NumberFormat;
- import java.util.HashMap;
- /**
- * A stream based parser for parsing delimited text data from a file or a
- * stream.
- */
- public class CsvReader {
- private Reader inputStream = null;
- private String fileName = null;
- // this holds all the values for switches that the user is allowed to set
- private UserSettings userSettings = new UserSettings();
- private Charset charset = null;
- private boolean useCustomRecordDelimiter = false;
- // this will be our working buffer to hold data chunks
- // read in from the data file
- private DataBuffer dataBuffer = new DataBuffer();
- private ColumnBuffer columnBuffer = new ColumnBuffer();
- private RawRecordBuffer rawBuffer = new RawRecordBuffer();
- private boolean[] isQualified = null;
- private String rawRecord = "";
- private HeadersHolder headersHolder = new HeadersHolder();
- // these are all more or less global loop variables
- // to keep from needing to pass them all into various
- // methods during parsing
- private boolean startedColumn = false;
- private boolean startedWithQualifier = false;
- private boolean hasMoreData = true;
- private char lastLetter = '\0';
- private boolean hasReadNextLine = false;
- private int columnsCount = 0;
- private long currentRecord = 0;
- private String[] values = new String[StaticSettings.INITIAL_COLUMN_COUNT];
- private boolean initialized = false;
- private boolean closed = false;
- /**
- * Double up the text qualifier to represent an occurance of the text
- * qualifier.
- */
- public static final int ESCAPE_MODE_DOUBLED = 1;
- /**
- * Use a backslash character before the text qualifier to represent an
- * occurance of the text qualifier.
- */
- public static final int ESCAPE_MODE_BACKSLASH = 2;
- /**
- * Creates a {@link com.csvreader.CsvReader CsvReader} object using a file
- * as the data source.
- *
- * @param fileName
- * The path to the file to use as the data source.
- * @param delimiter
- * The character to use as the column delimiter.
- * @param charset
- * The {@link java.nio.charset.Charset Charset} to use while
- * parsing the data.
- */
- public CsvReader(String fileName, char delimiter, Charset charset)
- throws FileNotFoundException {
- if (fileName == null) {
- throw new IllegalArgumentException(
- "Parameter fileName can not be null.");
- }
- if (charset == null) {
- throw new IllegalArgumentException(
- "Parameter charset can not be null.");
- }
- if (!new File(fileName).exists()) {
- throw new FileNotFoundException("File " + fileName
- + " does not exist.");
- }
- this.fileName = fileName;
- this.userSettings.Delimiter = delimiter;
- this.charset = charset;
- isQualified = new boolean[values.length];
- }
/**
 * Creates a {@link CsvReader} that uses the file at the given path as its
 * data source and decodes it as ISO-8859-1.
 *
 * @param fileName
 *            The path to the file to use as the data source.
 * @param delimiter
 *            The character to use as the column delimiter.
 * @throws FileNotFoundException
 *             If no file exists at the given path.
 */
public CsvReader(String fileName, char delimiter)
        throws FileNotFoundException {
    // delegate to the three-argument constructor with the default charset
    this(fileName, delimiter, Charset.forName("ISO-8859-1"));
}
/**
 * Creates a {@link CsvReader} that uses the file at the given path as its
 * data source, with a comma as the column delimiter and ISO-8859-1 as the
 * {@link java.nio.charset.Charset Charset}.
 *
 * @param fileName
 *            The path to the file to use as the data source.
 * @throws FileNotFoundException
 *             If no file exists at the given path.
 */
public CsvReader(String fileName) throws FileNotFoundException {
    // comma delimiter; the charset default comes from the delegate
    this(fileName, Letters.COMMA);
}
/**
 * Creates a {@link CsvReader} that pulls its data from an existing
 * {@link java.io.Reader Reader}.
 *
 * @param inputStream
 *            The stream to use as the data source.
 * @param delimiter
 *            The character to use as the column delimiter.
 * @throws IllegalArgumentException
 *             If inputStream is null.
 */
public CsvReader(Reader inputStream, char delimiter) {
    if (inputStream == null) {
        throw new IllegalArgumentException(
                "Parameter inputStream can not be null.");
    }
    this.userSettings.Delimiter = delimiter;
    this.inputStream = inputStream;
    this.isQualified = new boolean[this.values.length];
    // unlike the file-based constructors, this one marks the source initialized
    this.initialized = true;
}
/**
 * Creates a {@link CsvReader} that pulls its data from an existing
 * {@link java.io.Reader Reader}, with a comma as the column delimiter.
 *
 * @param inputStream
 *            The stream to use as the data source.
 * @throws IllegalArgumentException
 *             If inputStream is null.
 */
public CsvReader(Reader inputStream) {
    // same as the two-argument form with the delimiter fixed to a comma
    this(inputStream, Letters.COMMA);
}
/**
 * Creates a {@link CsvReader} that pulls its data from an
 * {@link java.io.InputStream InputStream}, decoding the bytes with the
 * given charset.
 *
 * @param inputStream
 *            The stream to use as the data source.
 * @param delimiter
 *            The character to use as the column delimiter.
 * @param charset
 *            The {@link java.nio.charset.Charset Charset} to use while
 *            parsing the data.
 */
public CsvReader(InputStream inputStream, char delimiter, Charset charset) {
    // wrap the byte stream in a decoding Reader and reuse the Reader constructor
    this(new InputStreamReader(inputStream, charset), delimiter);
}
/**
 * Creates a {@link CsvReader} that pulls its data from an
 * {@link java.io.InputStream InputStream}, decoding the bytes with the
 * given charset and using a comma as the column delimiter.
 *
 * @param inputStream
 *            The stream to use as the data source.
 * @param charset
 *            The {@link java.nio.charset.Charset Charset} to use while
 *            parsing the data.
 */
public CsvReader(InputStream inputStream, Charset charset) {
    // wrap the byte stream in a decoding Reader; the delegate supplies the comma
    this(new InputStreamReader(inputStream, charset));
}
- public boolean getCaptureRawRecord() {
- return userSettings.CaptureRawRecord;
- }
- public void setCaptureRawRecord(boolean captureRawRecord) {
- userSettings.CaptureRawRecord = captureRawRecord;
- }
- public String getRawRecord() {
- return rawRecord;
- }
- /**
- * Gets whether leading and trailing whitespace characters are being trimmed
- * from non-textqualified column data. Default is true.
- *
- * @return Whether leading and trailing whitespace characters are being
- * trimmed from non-textqualified column data.
- */
- public boolean getTrimWhitespace() {
- return userSettings.TrimWhitespace;
- }
- /**
- * Sets whether leading and trailing whitespace characters should be trimmed
- * from non-textqualified column data or not. Default is true.
- *
- * @param trimWhitespace
- * Whether leading and trailing whitespace characters should be
- * trimmed from non-textqualified column data or not.
- */
- public void setTrimWhitespace(boolean trimWhitespace) {
- userSettings.TrimWhitespace = trimWhitespace;
- }
- /**
- * Gets the character being used as the column delimiter. Default is comma,
- * ','.
- *
- * @return The character being used as the column delimiter.
- */
- public char getDelimiter() {
- return userSettings.Delimiter;
- }
- /**
- * Sets the character to use as the column delimiter. Default is comma, ','.
- *
- * @param delimiter
- * The character to use as the column delimiter.
- */
- public void setDelimiter(char delimiter) {
- userSettings.Delimiter = delimiter;
- }
- public char getRecordDelimiter() {
- return userSettings.RecordDelimiter;
- }
- /**
- * Sets the character to use as the record delimiter.
- *
- * @param recordDelimiter
- * The character to use as the record delimiter. Default is
- * combination of standard end of line characters for Windows,
- * Unix, or Mac.
- */
- public void setRecordDelimiter(char recordDelimiter) {
- useCustomRecordDelimiter = true;
- userSettings.RecordDelimiter = recordDelimiter;
- }
- /**
- * Gets the character to use as a text qualifier in the data.
- *
- * @return The character to use as a text qualifier in the data.
- */
- public char getTextQualifier() {
- return userSettings.TextQualifier;
- }
- /**
- * Sets the character to use as a text qualifier in the data.
- *
- * @param textQualifier
- * The character to use as a text qualifier in the data.
- */
- public void setTextQualifier(char textQualifier) {
- userSettings.TextQualifier = textQualifier;
- }
- /**
- * Whether text qualifiers will be used while parsing or not.
- *
- * @return Whether text qualifiers will be used while parsing or not.
- */
- public boolean getUseTextQualifier() {
- return userSettings.UseTextQualifier;
- }
- /**
- * Sets whether text qualifiers will be used while parsing or not.
- *
- * @param useTextQualifier
- * Whether to use a text qualifier while parsing or not.
- */
- public void setUseTextQualifier(boolean useTextQualifier) {
- userSettings.UseTextQualifier = useTextQualifier;
- }
- /**
- * Gets the character being used as a comment signal.
- *
- * @return The character being used as a comment signal.
- */
- public char getComment() {
- return userSettings.Comment;
- }
- /**
- * Sets the character to use as a comment signal.
- *
- * @param comment
- * The character to use as a comment signal.
- */
- public void setComment(char comment) {
- userSettings.Comment = comment;
- }
- /**
- * Gets whether comments are being looked for while parsing or not.
- *
- * @return Whether comments are being looked for while parsing or not.
- */
- public boolean getUseComments() {
- return userSettings.UseComments;
- }
- /**
- * Sets whether comments are being looked for while parsing or not.
- *
- * @param useComments
- * Whether comments are being looked for while parsing or not.
- */
- public void setUseComments(boolean useComments) {
- userSettings.UseComments = useComments;
- }
- /**
- * Gets the current way to escape an occurance of the text qualifier inside
- * qualified data.
- *
- * @return The current way to escape an occurance of the text qualifier
- * inside qualified data.
- */
- public int getEscapeMode() {
- return userSettings.EscapeMode;
- }
- /**
- * Sets the current way to escape an occurance of the text qualifier inside
- * qualified data.
- *
- * @param escapeMode
- * The way to escape an occurance of the text qualifier inside
- * qualified data.
- * @exception IllegalArgumentException
- * When an illegal value is specified for escapeMode.
- */
- public void setEscapeMode(int escapeMode) throws IllegalArgumentException {
- if (escapeMode != ESCAPE_MODE_DOUBLED
- && escapeMode != ESCAPE_MODE_BACKSLASH) {
- throw new IllegalArgumentException(
- "Parameter escapeMode must be a valid value.");
- }
- userSettings.EscapeMode = escapeMode;
- }
- public boolean getSkipEmptyRecords() {
- return userSettings.SkipEmptyRecords;
- }
- public void setSkipEmptyRecords(boolean skipEmptyRecords) {
- userSettings.SkipEmptyRecords = skipEmptyRecords;
- }
- /**
- * Safety caution to prevent the parser from using large amounts of memory
- * in the case where parsing settings like file encodings don't end up
- * matching the actual format of a file. This switch can be turned off if
- * the file format is known and tested. With the switch off, the max column
- * lengths and max column count per record supported by the parser will
- * greatly increase. Default is true.
- *
- * @return The current setting of the safety switch.
- */
- public boolean getSafetySwitch() {
- return userSettings.SafetySwitch;
- }
- /**
- * Safety caution to prevent the parser from using large amounts of memory
- * in the case where parsing settings like file encodings don't end up
- * matching the actual format of a file. This switch can be turned off if
- * the file format is known and tested. With the switch off, the max column
- * lengths and max column count per record supported by the parser will
- * greatly increase. Default is true.
- *
- * @param safetySwitch
- */
- public void setSafetySwitch(boolean safetySwitch) {
- userSettings.SafetySwitch = safetySwitch;
- }
- /**
- * Gets the count of columns found in this record.
- *
- * @return The count of columns found in this record.
- */
- public int getColumnCount() {
- return columnsCount;
- }
- /**
- * Gets the index of the current record.
- *
- * @return The index of the current record.
- */
- public long getCurrentRecord() {
- return currentRecord - 1;
- }
- /**
- * Gets the count of headers read in by a previous call to
- * {@link com.csvreader.CsvReader#readHeaders readHeaders()}.
- *
- * @return The count of headers read in by a previous call to
- * {@link com.csvreader.CsvReader#readHeaders readHeaders()}.
- */
- public int getHeaderCount() {
- return headersHolder.Length;
- }
- /**
- * Returns the header values as a string array.
- *
- * @return The header values as a String array.
- * @exception IOException
- * Thrown if this object has already been closed.
- */
- public String[] getHeaders() throws IOException {
- checkClosed();
- if (headersHolder.Headers == null) {
- return null;
- } else {
- // use clone here to prevent the outside code from
- // setting values on the array directly, which would
- // throw off the index lookup based on header name
- String[] clone = new String[headersHolder.Length];
- System.arraycopy(headersHolder.Headers, 0, clone, 0,
- headersHolder.Length);
- return clone;
- }
- }
- public void setHeaders(String[] headers) {
- headersHolder.Headers = headers;
- headersHolder.IndexByName.clear();
- if (headers != null) {
- headersHolder.Length = headers.length;
- } else {
- headersHolder.Length = 0;
- }
- // use headersHolder.Length here in case headers is null
- for (int i = 0; i < headersHolder.Length; i++) {
- headersHolder.IndexByName.put(headers[i], new Integer(i));
- }
- }
- public String[] getValues() throws IOException {
- checkClosed();
- // need to return a clone, and can't use clone because values.Length
- // might be greater than columnsCount
- String[] clone = new String[columnsCount];
- System.arraycopy(values, 0, clone, 0, columnsCount);
- return clone;
- }
- /**
- * Returns the current column value for a given column index.
- *
- * @param columnIndex
- * The index of the column.
- * @return The current column value.
- * @exception IOException
- * Thrown if this object has already been closed.
- */
- public String get(int columnIndex) throws IOException {
- checkClosed();
- if (columnIndex > -1 && columnIndex < columnsCount) {
- return values[columnIndex];
- } else {
- return "";
- }
- }
- /**
- * Returns the current column value for a given column header name.
- *
- * @param headerName
- * The header name of the column.
- * @return The current column value.
- * @exception IOException
- * Thrown if this object has already been closed.
- */
- public String get(String headerName) throws IOException {
- checkClosed();
- return get(getIndex(headerName));
- }
- /**
- * Creates a {@link com.csvreader.CsvReader CsvReader} object using a string
- * of data as the source. Uses ISO-8859-1 as the
- * {@link java.nio.charset.Charset Charset}.
- *
- * @param data
- * The String of data to use as the source.
- * @return A {@link com.csvreader.CsvReader CsvReader} object using the
- * String of data as the source.
- */
- public static CsvReader parse(String data) {
- if (data == null) {
- throw new IllegalArgumentException(
- "Parameter data can not be null.");
- }
- return new CsvReader(new StringReader(data));
- }
- /**
- * Reads another record.
- *
- * @return Whether another record was successfully read or not.
- * @exception IOException
- * Thrown if an error occurs while reading data from the
- * source stream.
- */
- public boolean readRecord() throws IOException {
- checkClosed();
- columnsCount = 0;
- rawBuffer.Position = 0;
- dataBuffer.LineStart = dataBuffer.Position;
- hasReadNextLine = false;
- // check to see if we've already found the end of data
- if (hasMoreData) {
- // loop over the data stream until the end of data is found
- // or the end of the record is found
- do {
- if (dataBuffer.Position == dataBuffer.Count) {
- checkDataLength();
- } else {
- startedWithQualifier = false;
- // grab the current letter as a char
- char currentLetter = dataBuffer.Buffer[dataBuffer.Position];
- if (userSettings.UseTextQualifier
- && currentLetter == userSettings.TextQualifier) {
- // this will be a text qualified column, so
- // we need to set startedWithQualifier to make it
- // enter the seperate branch to handle text
- // qualified columns
- lastLetter = currentLetter;
- // read qualified
- startedColumn = true;
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- startedWithQualifier = true;
- boolean lastLetterWasQualifier = false;
- char escapeChar = userSettings.TextQualifier;
- if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH) {
- escapeChar = Letters.BACKSLASH;
- }
- boolean eatingTrailingJunk = false;
- boolean lastLetterWasEscape = false;
- boolean readingComplexEscape = false;
- int escape = ComplexEscape.UNICODE;
- int escapeLength = 0;
- char escapeValue = (char) 0;
- dataBuffer.Position++;
- do {
- if (dataBuffer.Position == dataBuffer.Count) {
- checkDataLength();
- } else {
- // grab the current letter as a char
- currentLetter = dataBuffer.Buffer[dataBuffer.Position];
- if (eatingTrailingJunk) {
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- if (currentLetter == userSettings.Delimiter) {
- endColumn();
- } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
- || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
- endColumn();
- endRecord();
- }
- } else if (readingComplexEscape) {
- escapeLength++;
- switch (escape) {
- case ComplexEscape.UNICODE:
- escapeValue *= (char) 16;
- escapeValue += hexToDec(currentLetter);
- if (escapeLength == 4) {
- readingComplexEscape = false;
- }
- break;
- case ComplexEscape.OCTAL:
- escapeValue *= (char) 8;
- escapeValue += (char) (currentLetter - '0');
- if (escapeLength == 3) {
- readingComplexEscape = false;
- }
- break;
- case ComplexEscape.DECIMAL:
- escapeValue *= (char) 10;
- escapeValue += (char) (currentLetter - '0');
- if (escapeLength == 3) {
- readingComplexEscape = false;
- }
- break;
- case ComplexEscape.HEX:
- escapeValue *= (char) 16;
- escapeValue += hexToDec(currentLetter);
- if (escapeLength == 2) {
- readingComplexEscape = false;
- }
- break;
- }
- if (!readingComplexEscape) {
- appendLetter(escapeValue);
- } else {
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- }
- } else if (currentLetter == userSettings.TextQualifier) {
- if (lastLetterWasEscape) {
- lastLetterWasEscape = false;
- lastLetterWasQualifier = false;
- } else {
- updateCurrentValue();
- if (userSettings.EscapeMode == ESCAPE_MODE_DOUBLED) {
- lastLetterWasEscape = true;
- }
- lastLetterWasQualifier = true;
- }
- } else if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH
- && lastLetterWasEscape) {
- switch (currentLetter) {
- case 'n':
- appendLetter(Letters.LF);
- break;
- case 'r':
- appendLetter(Letters.CR);
- break;
- case 't':
- appendLetter(Letters.TAB);
- break;
- case 'b':
- appendLetter(Letters.BACKSPACE);
- break;
- case 'f':
- appendLetter(Letters.FORM_FEED);
- break;
- case 'e':
- appendLetter(Letters.ESCAPE);
- break;
- case 'v':
- appendLetter(Letters.VERTICAL_TAB);
- break;
- case 'a':
- appendLetter(Letters.ALERT);
- break;
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- escape = ComplexEscape.OCTAL;
- readingComplexEscape = true;
- escapeLength = 1;
- escapeValue = (char) (currentLetter - '0');
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- break;
- case 'u':
- case 'x':
- case 'o':
- case 'd':
- case 'U':
- case 'X':
- case 'O':
- case 'D':
- switch (currentLetter) {
- case 'u':
- case 'U':
- escape = ComplexEscape.UNICODE;
- break;
- case 'x':
- case 'X':
- escape = ComplexEscape.HEX;
- break;
- case 'o':
- case 'O':
- escape = ComplexEscape.OCTAL;
- break;
- case 'd':
- case 'D':
- escape = ComplexEscape.DECIMAL;
- break;
- }
- readingComplexEscape = true;
- escapeLength = 0;
- escapeValue = (char) 0;
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- break;
- default:
- break;
- }
- lastLetterWasEscape = false;
- // can only happen for ESCAPE_MODE_BACKSLASH
- } else if (currentLetter == escapeChar) {
- updateCurrentValue();
- lastLetterWasEscape = true;
- } else {
- if (lastLetterWasQualifier) {
- if (currentLetter == userSettings.Delimiter) {
- endColumn();
- } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
- || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
- endColumn();
- endRecord();
- } else {
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- eatingTrailingJunk = true;
- }
- // make sure to clear the flag for next
- // run of the loop
- lastLetterWasQualifier = false;
- }
- }
- // keep track of the last letter because we need
- // it for several key decisions
- lastLetter = currentLetter;
- if (startedColumn) {
- dataBuffer.Position++;
- if (userSettings.SafetySwitch
- && dataBuffer.Position
- - dataBuffer.ColumnStart
- + columnBuffer.Position > 100000) {
- close();
- throw new IOException(
- "Maximum column length of 100,000 exceeded in column "
- + NumberFormat
- .getIntegerInstance()
- .format(
- columnsCount)
- + " in record "
- + NumberFormat
- .getIntegerInstance()
- .format(
- currentRecord)
- + ". Set the SafetySwitch property to false"
- + " if you're expecting column lengths greater than 100,000 characters to"
- + " avoid this error.");
- }
- }
- } // end else
- } while (hasMoreData && startedColumn);
- } else if (currentLetter == userSettings.Delimiter) {
- // we encountered a column with no data, so
- // just send the end column
- lastLetter = currentLetter;
- endColumn();
- } else if (useCustomRecordDelimiter
- && currentLetter == userSettings.RecordDelimiter) {
- // this will skip blank lines
- if (startedColumn || columnsCount > 0
- || !userSettings.SkipEmptyRecords) {
- endColumn();
- endRecord();
- } else {
- dataBuffer.LineStart = dataBuffer.Position + 1;
- }
- lastLetter = currentLetter;
- } else if (!useCustomRecordDelimiter
- && (currentLetter == Letters.CR || currentLetter == Letters.LF)) {
- // this will skip blank lines
- if (startedColumn
- || columnsCount > 0
- || (!userSettings.SkipEmptyRecords && (currentLetter == Letters.CR || lastLetter != Letters.CR))) {
- endColumn();
- endRecord();
- } else {
- dataBuffer.LineStart = dataBuffer.Position + 1;
- }
- lastLetter = currentLetter;
- } else if (userSettings.UseComments && columnsCount == 0
- && currentLetter == userSettings.Comment) {
- // encountered a comment character at the beginning of
- // the line so just ignore the rest of the line
- lastLetter = currentLetter;
- skipLine();
- } else if (userSettings.TrimWhitespace
- && (currentLetter == Letters.SPACE || currentLetter == Letters.TAB)) {
- // do nothing, this will trim leading whitespace
- // for both text qualified columns and non
- startedColumn = true;
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- } else {
- // since the letter wasn't a special letter, this
- // will be the first letter of our current column
- startedColumn = true;
- dataBuffer.ColumnStart = dataBuffer.Position;
- boolean lastLetterWasBackslash = false;
- boolean readingComplexEscape = false;
- int escape = ComplexEscape.UNICODE;
- int escapeLength = 0;
- char escapeValue = (char) 0;
- boolean firstLoop = true;
- do {
- if (!firstLoop
- && dataBuffer.Position == dataBuffer.Count) {
- checkDataLength();
- } else {
- if (!firstLoop) {
- // grab the current letter as a char
- currentLetter = dataBuffer.Buffer[dataBuffer.Position];
- }
- if (!userSettings.UseTextQualifier
- && userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH
- && currentLetter == Letters.BACKSLASH) {
- if (lastLetterWasBackslash) {
- lastLetterWasBackslash = false;
- } else {
- updateCurrentValue();
- lastLetterWasBackslash = true;
- }
- } else if (readingComplexEscape) {
- escapeLength++;
- switch (escape) {
- case ComplexEscape.UNICODE:
- escapeValue *= (char) 16;
- escapeValue += hexToDec(currentLetter);
- if (escapeLength == 4) {
- readingComplexEscape = false;
- }
- break;
- case ComplexEscape.OCTAL:
- escapeValue *= (char) 8;
- escapeValue += (char) (currentLetter - '0');
- if (escapeLength == 3) {
- readingComplexEscape = false;
- }
- break;
- case ComplexEscape.DECIMAL:
- escapeValue *= (char) 10;
- escapeValue += (char) (currentLetter - '0');
- if (escapeLength == 3) {
- readingComplexEscape = false;
- }
- break;
- case ComplexEscape.HEX:
- escapeValue *= (char) 16;
- escapeValue += hexToDec(currentLetter);
- if (escapeLength == 2) {
- readingComplexEscape = false;
- }
- break;
- }
- if (!readingComplexEscape) {
- appendLetter(escapeValue);
- } else {
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- }
- } else if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH
- && lastLetterWasBackslash) {
- switch (currentLetter) {
- case 'n':
- appendLetter(Letters.LF);
- break;
- case 'r':
- appendLetter(Letters.CR);
- break;
- case 't':
- appendLetter(Letters.TAB);
- break;
- case 'b':
- appendLetter(Letters.BACKSPACE);
- break;
- case 'f':
- appendLetter(Letters.FORM_FEED);
- break;
- case 'e':
- appendLetter(Letters.ESCAPE);
- break;
- case 'v':
- appendLetter(Letters.VERTICAL_TAB);
- break;
- case 'a':
- appendLetter(Letters.ALERT);
- break;
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- escape = ComplexEscape.OCTAL;
- readingComplexEscape = true;
- escapeLength = 1;
- escapeValue = (char) (currentLetter - '0');
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- break;
- case 'u':
- case 'x':
- case 'o':
- case 'd':
- case 'U':
- case 'X':
- case 'O':
- case 'D':
- switch (currentLetter) {
- case 'u':
- case 'U':
- escape = ComplexEscape.UNICODE;
- break;
- case 'x':
- case 'X':
- escape = ComplexEscape.HEX;
- break;
- case 'o':
- case 'O':
- escape = ComplexEscape.OCTAL;
- break;
- case 'd':
- case 'D':
- escape = ComplexEscape.DECIMAL;
- break;
- }
- readingComplexEscape = true;
- escapeLength = 0;
- escapeValue = (char) 0;
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- break;
- default:
- break;
- }
- lastLetterWasBackslash = false;
- } else {
- if (currentLetter == userSettings.Delimiter) {
- endColumn();
- } else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR || currentLetter == Letters.LF))
- || (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter)) {
- endColumn();
- endRecord();
- }
- }
- // keep track of the last letter because we need
- // it for several key decisions
- lastLetter = currentLetter;
- firstLoop = false;
- if (startedColumn) {
- dataBuffer.Position++;
- if (userSettings.SafetySwitch
- && dataBuffer.Position
- - dataBuffer.ColumnStart
- + columnBuffer.Position > 100000) {
- close();
- throw new IOException(
- "Maximum column length of 100,000 exceeded in column "
- + NumberFormat
- .getIntegerInstance()
- .format(
- columnsCount)
- + " in record "
- + NumberFormat
- .getIntegerInstance()
- .format(
- currentRecord)
- + ". Set the SafetySwitch property to false"
- + " if you're expecting column lengths greater than 100,000 characters to"
- + " avoid this error.");
- }
- }
- } // end else
- } while (hasMoreData && startedColumn);
- }
- if (hasMoreData) {
- dataBuffer.Position++;
- }
- } // end else
- } while (hasMoreData && !hasReadNextLine);
- // check to see if we hit the end of the file
- // without processing the current record
- if (startedColumn || lastLetter == userSettings.Delimiter) {
- endColumn();
- endRecord();
- }
- }
- if (userSettings.CaptureRawRecord) {
- if (hasMoreData) {
- if (rawBuffer.Position == 0) {
- rawRecord = new String(dataBuffer.Buffer,
- dataBuffer.LineStart, dataBuffer.Position
- - dataBuffer.LineStart - 1);
- } else {
- rawRecord = new String(rawBuffer.Buffer, 0,
- rawBuffer.Position)
- + new String(dataBuffer.Buffer,
- dataBuffer.LineStart, dataBuffer.Position
- - dataBuffer.LineStart - 1);
- }
- } else {
- // for hasMoreData to ever be false, all data would have had to
- // have been
- // copied to the raw buffer
- rawRecord = new String(rawBuffer.Buffer, 0, rawBuffer.Position);
- }
- } else {
- rawRecord = "";
- }
- return hasReadNextLine;
- }
- /**
- * @exception IOException
- * Thrown if an error occurs while reading data from the
- * source stream.
- */
- private void checkDataLength() throws IOException {
- if (!initialized) {
- if (fileName != null) {
- inputStream = new BufferedReader(new InputStreamReader(
- new FileInputStream(fileName), charset),
- StaticSettings.MAX_FILE_BUFFER_SIZE);
- }
- charset = null;
- initialized = true;
- }
- updateCurrentValue();
- if (userSettings.CaptureRawRecord && dataBuffer.Count > 0) {
- if (rawBuffer.Buffer.length - rawBuffer.Position < dataBuffer.Count
- - dataBuffer.LineStart) {
- int newLength = rawBuffer.Buffer.length
- + Math.max(dataBuffer.Count - dataBuffer.LineStart,
- rawBuffer.Buffer.length);
- char[] holder = new char[newLength];
- System.arraycopy(rawBuffer.Buffer, 0, holder, 0,
- rawBuffer.Position);
- rawBuffer.Buffer = holder;
- }
- System.arraycopy(dataBuffer.Buffer, dataBuffer.LineStart,
- rawBuffer.Buffer, rawBuffer.Position, dataBuffer.Count
- - dataBuffer.LineStart);
- rawBuffer.Position += dataBuffer.Count - dataBuffer.LineStart;
- }
- try {
- dataBuffer.Count = inputStream.read(dataBuffer.Buffer, 0,
- dataBuffer.Buffer.length);
- } catch (IOException ex) {
- close();
- throw ex;
- }
- // if no more data could be found, set flag stating that
- // the end of the data was found
- if (dataBuffer.Count == -1) {
- hasMoreData = false;
- }
- dataBuffer.Position = 0;
- dataBuffer.LineStart = 0;
- dataBuffer.ColumnStart = 0;
- }
- /**
- * Read the first record of data as column headers.
- *
- * @return Whether the header record was successfully read or not.
- * @exception IOException
- * Thrown if an error occurs while reading data from the
- * source stream.
- */
- public boolean readHeaders() throws IOException {
- boolean result = readRecord();
- // copy the header data from the column array
- // to the header string array
- headersHolder.Length = columnsCount;
- headersHolder.Headers = new String[columnsCount];
- for (int i = 0; i < headersHolder.Length; i++) {
- String columnValue = get(i);
- headersHolder.Headers[i] = columnValue;
- // if there are duplicate header names, we will save the last one
- headersHolder.IndexByName.put(columnValue, new Integer(i));
- }
- if (result) {
- currentRecord--;
- }
- columnsCount = 0;
- return result;
- }
- /**
- * Returns the column header value for a given column index.
- *
- * @param columnIndex
- * The index of the header column being requested.
- * @return The value of the column header at the given column index.
- * @exception IOException
- * Thrown if this object has already been closed.
- */
- public String getHeader(int columnIndex) throws IOException {
- checkClosed();
- // check to see if we have read the header record yet
- // check to see if the column index is within the bounds
- // of our header array
- if (columnIndex > -1 && columnIndex < headersHolder.Length) {
- // return the processed header data for this column
- return headersHolder.Headers[columnIndex];
- } else {
- return "";
- }
- }
- public boolean isQualified(int columnIndex) throws IOException {
- checkClosed();
- if (columnIndex < columnsCount && columnIndex > -1) {
- return isQualified[columnIndex];
- } else {
- return false;
- }
- }
- /**
- * @exception IOException
- * Thrown if a very rare extreme exception occurs during
- * parsing, normally resulting from improper data format.
- */
- private void endColumn() throws IOException {
- String currentValue = "";
- // must be called before setting startedColumn = false
- if (startedColumn) {
- if (columnBuffer.Position == 0) {
- if (dataBuffer.ColumnStart < dataBuffer.Position) {
- int lastLetter = dataBuffer.Position - 1;
- if (userSettings.TrimWhitespace && !startedWithQualifier) {
- while (lastLetter >= dataBuffer.ColumnStart
- && (dataBuffer.Buffer[lastLetter] == Letters.SPACE || dataBuffer.Buffer[lastLetter] == Letters.TAB)) {
- lastLetter--;
- }
- }
- currentValue = new String(dataBuffer.Buffer,
- dataBuffer.ColumnStart, lastLetter
- - dataBuffer.ColumnStart + 1);
- }
- } else {
- updateCurrentValue();
- int lastLetter = columnBuffer.Position - 1;
- if (userSettings.TrimWhitespace && !startedWithQualifier) {
- while (lastLetter >= 0
- && (columnBuffer.Buffer[lastLetter] == Letters.SPACE || columnBuffer.Buffer[lastLetter] == Letters.SPACE)) {
- lastLetter--;
- }
- }
- currentValue = new String(columnBuffer.Buffer, 0,
- lastLetter + 1);
- }
- }
- columnBuffer.Position = 0;
- startedColumn = false;
- if (columnsCount >= 100000 && userSettings.SafetySwitch) {
- close();
- throw new IOException(
- "Maximum column count of 100,000 exceeded in record "
- + NumberFormat.getIntegerInstance().format(
- currentRecord)
- + ". Set the SafetySwitch property to false"
- + " if you're expecting more than 100,000 columns per record to"
- + " avoid this error.");
- }
- // check to see if our current holder array for
- // column chunks is still big enough to handle another
- // column chunk
- if (columnsCount == values.length) {
- // holder array needs to grow to be able to hold another column
- int newLength = values.length * 2;
- String[] holder = new String[newLength];
- System.arraycopy(values, 0, holder, 0, values.length);
- values = holder;
- boolean[] qualifiedHolder = new boolean[newLength];
- System.arraycopy(isQualified, 0, qualifiedHolder, 0,
- isQualified.length);
- isQualified = qualifiedHolder;
- }
- values[columnsCount] = currentValue;
- isQualified[columnsCount] = startedWithQualifier;
- currentValue = "";
- columnsCount++;
- }
- private void appendLetter(char letter) {
- if (columnBuffer.Position == columnBuffer.Buffer.length) {
- int newLength = columnBuffer.Buffer.length * 2;
- char[] holder = new char[newLength];
- System.arraycopy(columnBuffer.Buffer, 0, holder, 0,
- columnBuffer.Position);
- columnBuffer.Buffer = holder;
- }
- columnBuffer.Buffer[columnBuffer.Position++] = letter;
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- }
- private void updateCurrentValue() {
- if (startedColumn && dataBuffer.ColumnStart < dataBuffer.Position) {
- if (columnBuffer.Buffer.length - columnBuffer.Position < dataBuffer.Position
- - dataBuffer.ColumnStart) {
- int newLength = columnBuffer.Buffer.length
- + Math.max(
- dataBuffer.Position - dataBuffer.ColumnStart,
- columnBuffer.Buffer.length);
- char[] holder = new char[newLength];
- System.arraycopy(columnBuffer.Buffer, 0, holder, 0,
- columnBuffer.Position);
- columnBuffer.Buffer = holder;
- }
- System.arraycopy(dataBuffer.Buffer, dataBuffer.ColumnStart,
- columnBuffer.Buffer, columnBuffer.Position,
- dataBuffer.Position - dataBuffer.ColumnStart);
- columnBuffer.Position += dataBuffer.Position
- - dataBuffer.ColumnStart;
- }
- dataBuffer.ColumnStart = dataBuffer.Position + 1;
- }
- /**
- * @exception IOException
- * Thrown if an error occurs while reading data from the
- * source stream.
- */
- private void endRecord() throws IOException {
- // this flag is used as a loop exit condition
- // during parsing
- hasReadNextLine = true;
- currentRecord++;
- }
- /**
- * Gets the corresponding column index for a given column header name.
- *
- * @param headerName
- * The header name of the column.
- * @return The column index for the given column header name. Returns
- * -1 if not found.
- * @exception IOException
- * Thrown if this object has already been closed.
- */
- public int getIndex(String headerName) throws IOException {
- checkClosed();
- Object indexValue = headersHolder.IndexByName.get(headerName);
- if (indexValue != null) {
- return ((Integer) indexValue).intValue();
- } else {
- return -1;
- }
- }
- /**
- * Skips the next record of data by parsing each column. Does not
- * increment
- * {@link com.csvreader.CsvReader#getCurrentRecord getCurrentRecord()}.
- *
- * @return Whether another record was successfully skipped or not.
- * @exception IOException
- * Thrown if an error occurs while reading data from the
- * source stream.
- */
- public boolean skipRecord() throws IOException {
- checkClosed();
- boolean recordRead = false;
- if (hasMoreData) {
- recordRead = readRecord();
- if (recordRead) {
- currentRecord--;
- }
- }
- return recordRead;
- }
- /**
- * Skips the next line of data using the standard end of line characters and
- * does not do any column delimited parsing.
- *
- * @return Whether a line was successfully skipped or not.
- * @exception IOException
- * Thrown if an error occurs while reading data from the
- * source stream.
- */
- public boolean skipLine() throws IOException {
- checkClosed();
- // clear public column values for current line
- columnsCount = 0;
- boolean skippedLine = false;
- if (hasMoreData) {
- boolean foundEol = false;
- do {
- if (dataBuffer.Position == dataBuffer.Count) {
- checkDataLength();
- } else {
- skippedLine = true;
- // grab the current letter as a char
- char currentLetter = dataBuffer.Buffer[dataBuffer.Position];
- if (currentLetter == Letters.CR
- || currentLetter == Letters.LF) {
- foundEol = true;
- }
- // keep track of the last letter because we need
- // it for several key decisions
- lastLetter = currentLetter;
- if (!foundEol) {
- dataBuffer.Position++;
- }
- } // end else
- } while (hasMoreData && !foundEol);
- columnBuffer.Position = 0;
- dataBuffer.LineStart = dataBuffer.Position + 1;
- }
- rawBuffer.Position = 0;
- rawRecord = "";
- return skippedLine;
- }
- /**
- * Closes and releases all related resources.
- */
- public void close() {
- if (!closed) {
- close(true);
- closed = true;
- }
- }
- /**
- *
- */
- private void close(boolean closing) {
- if (!closed) {
- if (closing) {
- charset = null;
- headersHolder.Headers = null;
- headersHolder.IndexByName = null;
- dataBuffer.Buffer = null;
- columnBuffer.Buffer = null;
- rawBuffer.Buffer = null;
- }
- try {
- if (initialized) {
- inputStream.close();
- }
- } catch (Exception e) {
- // just eat the exception
- }
- inputStream = null;
- closed = true;
- }
- }
- /**
- * @exception IOException
- * Thrown if this object has already been closed.
- */
- private void checkClosed() throws IOException {
- if (closed) {
- throw new IOException(
- "This instance of the CsvReader class has already been closed.");
- }
- }
- /**
- *
- */
- protected void finalize() {
- close(false);
- }
- private class ComplexEscape {
- private static final int UNICODE = 1;
- private static final int OCTAL = 2;
- private static final int DECIMAL = 3;
- private static final int HEX = 4;
- }
- private static char hexToDec(char hex) {
- char result;
- if (hex >= 'a') {
- result = (char) (hex - 'a' + 10);
- } else if (hex >= 'A') {
- result = (char) (hex - 'A' + 10);
- } else {
- result = (char) (hex - '0');
- }
- return result;
- }
- private class DataBuffer {
- public char[] Buffer;
- public int Position;
- // / <summary>
- // / How much usable data has been read into the stream,
- // / which will not always be as long as Buffer.Length.
- // / </summary>
- public int Count;
- // / <summary>
- // / The position of the cursor in the buffer when the
- // / current column was started or the last time data
- // / was moved out to the column buffer.
- // / </summary>
- public int ColumnStart;
- public int LineStart;
- public DataBuffer() {
- Buffer = new char[StaticSettings.MAX_BUFFER_SIZE];
- Position = 0;
- Count = 0;
- ColumnStart = 0;
- LineStart = 0;
- }
- }
- private class ColumnBuffer {
- public char[] Buffer;
- public int Position;
- public ColumnBuffer() {
- Buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE];
- Position = 0;
- }
- }
- private class RawRecordBuffer {
- public char[] Buffer;
- public int Position;
- public RawRecordBuffer() {
- Buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE
- * StaticSettings.INITIAL_COLUMN_COUNT];
- Position = 0;
- }
- }
- private class Letters {
- public static final char LF = '\n';
- public static final char CR = '\r';
- public static final char QUOTE = '"';
- public static final char COMMA = ',';
- public static final char SPACE = ' ';
- public static final char TAB = '\t';
- public static final char POUND = '#';
- public static final char BACKSLASH = '\\';
- public static final char NULL = '\0';
- public static final char BACKSPACE = '\b';
- public static final char FORM_FEED = '\f';
- public static final char ESCAPE = '\u001B'; // ASCII/ANSI escape
- public static final char VERTICAL_TAB = '\u000B';
- public static final char ALERT = '\u0007';
- }
- private class UserSettings {
- // having these as publicly accessible members will prevent
- // the overhead of the method call that exists on properties
- public boolean CaseSensitive;
- public char TextQualifier;
- public boolean TrimWhitespace;
- public boolean UseTextQualifier;
- public char Delimiter;
- public char RecordDelimiter;
- public char Comment;
- public boolean UseComments;
- public int EscapeMode;
- public boolean SafetySwitch;
- public boolean SkipEmptyRecords;
- public boolean CaptureRawRecord;
- public UserSettings() {
- CaseSensitive = true;
- TextQualifier = Letters.QUOTE;
- TrimWhitespace = true;
- UseTextQualifier = true;
- Delimiter = Letters.COMMA;
- RecordDelimiter = Letters.NULL;
- Comment = Letters.POUND;
- UseComments = false;
- EscapeMode = CsvReader.ESCAPE_MODE_DOUBLED;
- SafetySwitch = true;
- SkipEmptyRecords = true;
- CaptureRawRecord = true;
- }
- }
- private class HeadersHolder {
- public String[] Headers;
- public int Length;
- public HashMap IndexByName;
- public HeadersHolder() {
- Headers = null;
- Length = 0;
- IndexByName = new HashMap();
- }
- }
- private class StaticSettings {
- // these are static instead of final so they can be changed in unit test
- // isn't visible outside this class and is only accessed once during
- // CsvReader construction
- public static final int MAX_BUFFER_SIZE = 1024;
- public static final int MAX_FILE_BUFFER_SIZE = 4 * 1024;
- public static final int INITIAL_COLUMN_COUNT = 10;
- public static final int INITIAL_COLUMN_BUFFER_SIZE = 50;
- }
- }
|