For my application I have to parse a CSV file using Erlang. The following code parses CSV in Erlang:
%% Reads the whole file Fn into memory and parses its contents as CSV.
%% Fails with badmatch if the file cannot be read.
%% The contents are converted to a list of bytes before being handed to parse/1,
%% whose result is returned unchanged.
parse_file(Fn) ->
    {ok, Contents} = file:read_file(Fn),
    Chars = binary_to_list(Contents),
    parse(Chars).
%% Parses CSV text (a character list) into a list of lines in input order.
%% Each element is the field list that parse_line/1 produced for that line.
parse(Data) ->
    RevLines = parse(Data, []),
    lists:reverse(RevLines).

%% Accumulating loop: takes one line off the input per iteration and prepends
%% it to Lines, so the returned list is in reverse input order.
parse([], Lines) ->
    Lines;
parse(Remaining, Lines) ->
    {Fields, Rest} = parse_line(Remaining),
    parse(Rest, [Fields | Lines]).
%% Parses one CSV line from the front of Data (a character list).
%% Returns {Fields, Rest}: the line's fields in order and the input that
%% follows the line terminator (CRLF, LF or CR). An empty line yields [].
parse_line(Data) ->
    {RevFields, Rest} = parse_line(Data, []),
    {lists:reverse(RevFields), Rest}.

%% Worker for parse_line/1; Acc holds the fields read so far, newest first.
%%
%% Every comma is followed by exactly one field, which may be empty. This keeps
%% runs of commas ("a,,,b"), a trailing comma ("a,") and a leading comma (",a")
%% from losing their empty fields.
parse_line([13, 10 | Rest], Acc) -> {Acc, Rest};
parse_line([10 | Rest], Acc) -> {Acc, Rest};
parse_line([13 | Rest], Acc) -> {Acc, Rest};
parse_line([], Acc) -> {Acc, []};
parse_line([$, | _] = Data, []) ->
    %% The line starts with a separator, so its first field is empty.
    parse_line(Data, [""]);
parse_line([$, | Rest], Acc) ->
    %% parse_field/1 returns "" without consuming anything when Rest is empty
    %% or starts with a comma or line terminator.
    {Field, Tail} = parse_field(Rest),
    parse_line(Tail, [Field | Acc]);
parse_line(Data, Acc) ->
    %% First field of the line, or text directly following a closing quote.
    {Field, Tail} = parse_field(Data),
    parse_line(Tail, [Field | Acc]).
%% Parses one field from the front of the input.
%% Returns {Field, Rest}: the field's characters in order and the unconsumed input.
%% A field that starts with a double quote is read by parse_fieldq/2,
%% any other field by parse_field/2.
parse_field([$" | Quoted]) ->
    in_order(parse_fieldq(Quoted, []));
parse_field(Unquoted) ->
    in_order(parse_field(Unquoted, [])).

%% Turns a scanner result {ReversedChars, Rest} into {Chars, Rest}.
in_order({RevChars, Rest}) ->
    {lists:reverse(RevChars), Rest}.

%% Unquoted scanner: collects characters (in reverse) until a comma, CR, LF or
%% the end of input. The terminating character is left in the returned rest.
parse_field([Ch | _] = Rest, Acc) when Ch =:= $,; Ch =:= $\r; Ch =:= $\n ->
    {Acc, Rest};
parse_field([], Acc) ->
    {Acc, []};
parse_field([Ch | Rest], Acc) ->
    parse_field(Rest, [Ch | Acc]).

%% Quoted scanner; the opening quote has already been consumed.
%% Two consecutive quotes contribute one literal quote; a single quote closes
%% the field and is consumed. There is no clause for the end of input, so a
%% missing closing quote fails with function_clause.
parse_fieldq([$", $" | Rest], Acc) ->
    parse_fieldq(Rest, [$" | Acc]);
parse_fieldq([$" | Rest], Acc) ->
    {Acc, Rest};
parse_fieldq([Ch | Rest], Acc) ->
    parse_fieldq(Rest, [Ch | Acc]).
This code works, but it has two issues. 1. The code splits values on double quotes (") and commas (,). In the following example the First Name field contains a double-quoted string inside it, so the parser creates one extra field.
"Type","First Name","Last Name","Email"
"Contact","Ashwani Garg ------"All Pain Will End."","","[email protected]"
result:-
[["contact"],["Ashwani Garg ------"],["All Pain Will End."],[],["[email protected]"]]
expected result:-
[["contact"],["Ashwani Garg ------All Pain Will End."],[],["[email protected]"]]
2. For the following kind of CSV, which contains many empty values, the parser drops some of the fields:
First Name,Last Name,Middle Name,Name,Nickname,E-mail Address,Home Street,Home City,Home Postal Code,Home State,Home Country/Region,Home Phone,Home Fax,Mobile Phone,Personal Web Page,Business Street,Business City,Business Postal Code,Business State,Business Country/Region,Business Web Page,Business Phone,Business Fax,Pager,Company,Job Title,Department,Office Location,Notes
Affection,,,Affection,,,,,,,,+919845141544,,+919845141544,,,,,,,,,,,,,,,
result:-
[["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[]]
expected result:-
[["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
Please help. For reference, see the following link: http://ppolv.wordpress.com/2008/02/25/parsing-csv-in-erlang/
%% Parses the CSV file File one line at a time.
%% Returns the rows in file order; each row is the result of parse_line/1 on
%% that line with its line terminator removed.
%% Fails with badmatch if the file cannot be opened.
%% NOTE: input is consumed one physical line at a time, so a quoted field that
%% contains a line break is not supported.
parse(File) ->
    %% read_line/1 on a raw file is only efficient with read-ahead buffering.
    {ok, F} = file:open(File, [read, raw, read_ahead]),
    parse(F, file:read_line(F), []).

%% Read loop: the second argument is the latest result of file:read_line/1.
%% An {error, Reason} result matches no clause and fails with function_clause.
parse(F, eof, Done) ->
    file:close(F),
    lists:reverse(Done);
parse(F, {ok, Line}, Done) ->
    %% read_line/1 wraps the data in {ok, _} and keeps the trailing linefeed;
    %% unwrap it and strip the terminator before parsing.
    parse(F, file:read_line(F), [parse_line(strip_eol(Line)) | Done]).

%% Removes a single trailing linefeed from Line, if present.
strip_eol(Line) ->
    case lists:reverse(Line) of
        [$\n | Rev] -> lists:reverse(Rev);
        _ -> Line
    end.
%% Splits one CSV line (a character list) into its list of fields.
%% An empty line yields [].
parse_line([]) ->
    [];
parse_line(Line) ->
    %% Start in field context: a leading comma then ends an empty first field
    %% instead of being swallowed as a separator (",a" gives ["", "a"]).
    parse_field(Line, []).

%% Called between fields; Fields holds the fields read so far, newest first.
%% Consumes the separating comma, if any, and continues with the next field.
parse_line([], Fields) -> lists:reverse(Fields);
parse_line("," ++ Line, Fields) -> parse_field(Line, Fields);
parse_line(Line, Fields) -> parse_field(Line, Fields).
%% Reads the next field of Line; Fields holds the fields read so far, newest
%% first. A leading double quote selects the quoted scanner.
parse_field([$" | Line], Fields) ->
    parse_field_q(Line, [], Fields);
parse_field(Line, Fields) ->
    parse_field(Line, [], Fields).

%% Unquoted scanner; Buf collects the field's characters in reverse.
%% The field ends at the end of the line or at a comma, which is left in the
%% input for parse_line/2 to consume.
parse_field([], Buf, Fields) ->
    parse_line([], [lists:reverse(Buf) | Fields]);
parse_field([$, | _] = Line, Buf, Fields) ->
    parse_line(Line, [lists:reverse(Buf) | Fields]);
parse_field([C | Line], Buf, Fields) ->
    parse_field(Line, [C | Buf], Fields).
%% Convenience entry that starts the quoted scanner with an empty buffer.
parse_field_q(Line, Fields) ->
    parse_field_q(Line, [], Fields).

%% Quoted scanner; the opening quote has already been consumed and Buf collects
%% the field's characters in reverse.
%% Two consecutive quotes add one literal quote. A single quote ends the field
%% and hands the remaining input to parse_line/2. No clause matches an empty
%% Line, so a missing closing quote fails with function_clause.
parse_field_q([$", $" | Line], Buf, Fields) ->
    parse_field_q(Line, [$" | Buf], Fields);
parse_field_q([$" | Line], Buf, Fields) ->
    parse_line(Line, [lists:reverse(Buf) | Fields]);
parse_field_q([C | Line], Buf, Fields) ->
    parse_field_q(Line, [C | Buf], Fields).
Without file:read_line:
%% Reads the whole file File into memory and parses it as CSV.
%% Fails with badmatch if the file cannot be read.
%% The contents are converted to a list of bytes and passed to parse/2 with an
%% empty accumulator.
parse_file(File) ->
    {ok, Contents} = file:read_file(File),
    Chars = binary_to_list(Contents),
    parse(Chars, []).
%% Splits Data (a character list) into lines and parses each one with
%% parse_line/1. Done accumulates the parsed rows newest first; the result is
%% returned in input order.
parse([], Done) ->
    lists:reverse(Done);
parse(Data, Done) ->
    %% "\r\n" has to come before "\r" in the alternation: the regex engine
    %% takes the first alternative that matches, so with "\r" first a CRLF was
    %% split at the CR and the leftover LF produced a spurious empty row.
    {Line, Rest} =
        case re:split(Data, "\r\n|\r|\n", [{return, list}, {parts, 2}]) of
            [L, R] -> {L, R};
            %% No terminator left: the remaining data is the last line.
            [L] -> {L, []}
        end,
    parse(Rest, [parse_line(Line) | Done]).
A side issue:
How are you creating the CSV input? It doesn't appear to be valid CSV (not that there is a particularly rigorous specification for CSV though).
Typically to use double quotes inside a CSV field they need to be escaped as a pair of double quotes, so your example would be:
"Type","First Name","Last Name","Email"
"Contact","Ashwani Garg ------""All Pain Will End.""","","[email protected]"
This imports fine into an OpenOffice spreadsheet, whereas your original example does not.
I came across your implementation the other day and started playing around with it.
I made you a parser as well.
%% CSV parser that works directly on binaries.
%%
%% Rows are separated by CR, LF or CRLF; blank lines are skipped. Fields are
%% separated by commas. A field may be wrapped in double quotes, in which case
%% it may contain commas, line breaks and doubled quotes ("" stands for one
%% literal quote). Every field is returned as a binary.
-module(csv_parser).
-export([parse_file/1, parse/1]).

%% Reads File and parses its contents as CSV; see parse/1 for the result.
%% Fails with badmatch if the file cannot be read.
parse_file(File) ->
    {ok, Data} = file:read_file(File),
    parse(Data).

%% Parses CSV text held in the binary Data.
%% Returns a list of rows in input order, each row a list of field binaries.
%% Raises error(unterminated_quoted_field) if a quoted field is never closed.
parse(Data) ->
    parse_rows(Data, []).

%% Row loop. A CR or LF seen at the start of a row is either the terminator of
%% the previous row or a blank line; both are skipped.
parse_rows(<<>>, Rows) ->
    lists:reverse(Rows);
parse_rows(<<C, Rest/binary>>, Rows) when C =:= $\r; C =:= $\n ->
    parse_rows(Rest, Rows);
parse_rows(Data, Rows) ->
    {Row, Rest} = parse_row(Data, []),
    parse_rows(Rest, [Row | Rows]).

%% Reads fields until one is not followed by a comma.
%% Returns {Row, Rest}; Rest begins at the row terminator or is empty.
parse_row(Data, Fields) ->
    case parse_field(Data) of
        {Field, <<",", Rest/binary>>} ->
            parse_row(Rest, [Field | Fields]);
        {Field, Rest} ->
            {lists:reverse([Field | Fields]), Rest}
    end.

%% Reads one field. Blanks in front of an opening quote are ignored, so
%% `"a", "b"` parses as two quoted fields; an unquoted field keeps its blanks.
parse_field(Data) ->
    case skip_blanks(Data) of
        <<"\"", Quoted/binary>> -> take_quoted(Quoted, <<>>);
        _ -> take_unquoted(Data)
    end.

%% Drops leading spaces and tabs.
skip_blanks(<<C, Rest/binary>>) when C =:= $\s; C =:= $\t -> skip_blanks(Rest);
skip_blanks(Data) -> Data.

%% Returns {Field, Rest}, where Field is everything before the first comma, CR
%% or LF (or all of Data if there is none). The separator stays in Rest.
take_unquoted(Data) ->
    case binary:match(Data, [<<",">>, <<"\r">>, <<"\n">>]) of
        {Pos, _Len} -> split_binary(Data, Pos);
        nomatch -> {Data, <<>>}
    end.

%% Reads the remainder of a quoted field (opening quote already consumed).
%% Acc holds the field content collected so far.
take_quoted(Data, Acc) ->
    case binary:match(Data, <<"\"">>) of
        {Pos, _Len} ->
            <<Chunk:Pos/binary, "\"", After/binary>> = Data,
            case After of
                <<"\"", More/binary>> ->
                    %% Doubled quote: keep one literal quote and carry on.
                    take_quoted(More, <<Acc/binary, Chunk/binary, "\"">>);
                _ ->
                    %% Closing quote. Any text between it and the next
                    %% separator is ignored.
                    {_Ignored, Rest} = take_unquoted(After),
                    {<<Acc/binary, Chunk/binary>>, Rest}
            end;
        nomatch ->
            error(unterminated_quoted_field)
    end.
I even wrote a small blog post about this CSV parser.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With