Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

csv parser in erlang

Tags:

csv

erlang

For my application I have to parse a CSV file using Erlang. The following code parses CSV using Erlang:

%% Reads the file named Fn in one go and parses its contents as CSV.
%% Crashes with a badmatch if file:read_file/1 does not return {ok, _}.
%% The contents are passed to parse/1 as a list of bytes.
parse_file(Fn) ->
    {ok, Bytes} = file:read_file(Fn),
    Chars = binary_to_list(Bytes),
    parse(Chars).

%% Parses CSV text (a character list) into a list of rows in input order.
%% parse/2 collects rows newest-first, so its result is reversed here.
parse(Data) ->
    RevRows = parse(Data, []),
    lists:reverse(RevRows).

%% Consumes Data one line at a time, pushing each parsed row onto Acc.
%% Returns Acc (rows in reverse input order) once Data is exhausted.
parse(Data, Acc) ->
    case Data of
        [] ->
            Acc;
        _ ->
            {Row, Rest} = parse_line(Data),
            parse(Rest, [Row | Acc])
    end.

%% Parses a single line from the front of Data.
%% Returns {Fields, Rest}: the line's fields in input order and the text
%% that follows the line. parse_line/2 yields the fields newest-first.
parse_line(Data) ->
    {RevFields, Rest} = parse_line(Data, []),
    Fields = lists:reverse(RevFields),
    {Fields, Rest}.

%% parse_line(Data, Acc) -> {RevFields, Rest}
%%
%% Parses the fields of one line from the front of Data, pushing each
%% field onto Acc (so fields come back newest-first), and returns the
%% text following the line terminator (CRLF, LF, CR) or [] at end of input.
%%
%% Parsing alternates strictly between "read one field" (here) and
%% "read one separator" (parse_line_end/2). Every comma therefore
%% delimits exactly one field, so consecutive commas, a leading comma and
%% a trailing comma all produce their empty fields ("a,,,b" has four
%% fields; "a," has two).
%%
%% A completely empty line still yields no fields at all.
parse_line([13, 10 | Data], []) -> {[], Data};
parse_line([10 | Data], []) -> {[], Data};
parse_line([13 | Data], []) -> {[], Data};
parse_line([], []) -> {[], []};
parse_line(Data, Acc) ->
    %% parse_field/1 returns "" without consuming anything when Data
    %% starts with a comma, a line terminator, or is empty.
    {Fld, Tail} = parse_field(Data),
    parse_line_end(Tail, [Fld | Acc]).

%% Handles whatever follows a field: a comma continues the line, a line
%% terminator or end of input finishes it.
parse_line_end([$, | Data], Acc) -> parse_line(Data, Acc);
parse_line_end([13, 10 | Data], Acc) -> {Acc, Data};
parse_line_end([10 | Data], Acc) -> {Acc, Data};
parse_line_end([13 | Data], Acc) -> {Acc, Data};
parse_line_end([], Acc) -> {Acc, []};
%% Only reachable after a quoted field whose closing quote is followed by
%% more text (malformed CSV). As before, that text becomes its own field.
parse_line_end(Data, Acc) -> parse_line(Data, Acc).

%% Parses one field from the front of Data and returns {Field, Rest}.
%% A field that opens with a double quote is read by parse_fieldq/2
%% (the opening quote is dropped); anything else is read by parse_field/2.
%% Both helpers build the field back-to-front, hence the final reverse.
parse_field(Data) ->
    {RevField, Rest} =
        case Data of
            [$\" | Quoted] -> parse_fieldq(Quoted, "");
            _ -> parse_field(Data, "")
        end,
    {lists:reverse(RevField), Rest}.

%% parse_field(Data, Acc) -> {RevField, Rest}
%%
%% Reads an unquoted field: characters are pushed onto Acc until a comma,
%% CR, LF or the end of input is reached. The delimiter is NOT consumed;
%% it stays at the head of Rest for the caller to inspect.
parse_field([Ch | _] = Rest, Acc) when Ch =:= $,; Ch =:= 13; Ch =:= 10 ->
    {Acc, Rest};
parse_field([], Acc) ->
    {Acc, []};
parse_field([Ch | Rest], Acc) ->
    parse_field(Rest, [Ch | Acc]).

%% parse_fieldq(Data, Acc) -> {RevField, Rest}
%%
%% Reads the body of a quoted field (the opening quote has already been
%% consumed by the caller). Characters are pushed onto Acc, so the field
%% comes back reversed.
%%   * a doubled quote ("") is an escaped quote and contributes one $";
%%   * a single quote closes the field and is consumed;
%%   * end of input also closes the field, so an unterminated quote at
%%     the end of the data yields what was read instead of crashing with
%%     function_clause (mirrors how parse_field/2 treats end of input).
parse_fieldq([34, 34 | Tail], Acc) -> parse_fieldq(Tail, [34 | Acc]);
parse_fieldq([34 | Tail], Acc) -> {Acc, Tail};
parse_fieldq([], Acc) -> {Acc, []};
parse_fieldq([Ch | Tail], Acc) -> parse_fieldq(Tail, [Ch | Acc]).

This code works fine but has two issues. 1. The code splits the input on double quotes (") and commas (,) to separate the values. In the following example, however, the First Name field contains a double-quoted string inside it, so the parser creates one extra field.

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------"All Pain Will End."","","[email protected]"

result:-
[["contact"],["Ashwani  Garg ------"],["All Pain Will End."],[],["[email protected]"]]

expected result:-
[["contact"],["Ashwani  Garg ------All Pain Will End."],[],["[email protected]"]]

2. For the following kind of CSV, some of the empty values are truncated: First Name,Last Name,Middle Name,Name,Nickname,E-mail Address,Home Street,Home City,Home Postal Code,Home State,Home Country/Region,Home Phone,Home Fax,Mobile Phone,Personal Web Page,Business Street,Business City,Business Postal Code,Business State,Business Country/Region,Business Web Page,Business Phone,Business Fax,Pager,Company,Job Title,Department,Office Location,Notes

    Affection,,,Affection,,,,,,,,+919845141544,,+919845141544,,,,,,,,,,,,,,,
    result:-
    [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[]]
    expected result:-
   [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]

Please help me. For reference, please see the following link: http://ppolv.wordpress.com/2008/02/25/parsing-csv-in-erlang/

like image 453
Abhimanyu Avatar asked Oct 07 '09 14:10

Abhimanyu


3 Answers

%% Opens File and parses it line by line; returns the list of parsed rows.
%% Crashes with a badmatch if the file cannot be opened.
%%
%% The file is opened in raw mode, and file:read_line/1 is inefficient on
%% raw files unless read-ahead buffering is enabled, so read_ahead is
%% requested as well.
parse(File) ->
    {ok, F} = file:open(File, [read, raw, read_ahead]),
    parse(F, file:read_line(F), []).

%% parse(F, ReadResult, Done) -> Rows
%%
%% Driver loop over the results of file:read_line/1 for the open file F.
%% Done holds the rows parsed so far, newest first.
%%   * eof             - close the file and return the rows in file order;
%%   * {ok, Line}      - parse the line and read the next one;
%%   * {error, Reason} - close the file, then raise an error.
parse(F, eof, Done) ->
    file:close(F),
    lists:reverse(Done);
parse(F, {ok, Line}, Done) ->
    %% read_line/1 wraps the text in {ok, _} and keeps the trailing LF;
    %% both must be removed before the text reaches parse_line/1.
    Row = parse_line(strip_newline(Line)),
    parse(F, file:read_line(F), [Row | Done]);
parse(F, {error, Reason}, _Done) ->
    file:close(F),
    error({read_line_failed, Reason}).

%% Drops a single trailing linefeed, if present.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | RevLine] -> lists:reverse(RevLine);
        _ -> Line
    end.



%% Parses one line of CSV text into a list of fields, in input order.
%% An empty line yields no fields.
%%
%% A non-empty line always starts with a field, so parsing begins in
%% parse_field/2 rather than parse_line/2. parse_line/2 treats a leading
%% comma as the separator after a previous field, which would silently
%% drop an empty first field (",a" must give ["", "a"], not ["a"]).
parse_line([]) -> [];
parse_line(Line) -> parse_field(Line, []).

%% parse_line(Line, Fields) -> FieldList
%%
%% Continues parsing Line after zero or more fields have been collected
%% in Fields (newest first). At the end of the line the collected fields
%% are returned in input order. A leading comma is skipped as a
%% separator; in either remaining case the next field is parsed.
parse_line(Line, Fields) ->
    case Line of
        [] -> lists:reverse(Fields);
        [$, | AfterComma] -> parse_field(AfterComma, Fields);
        _ -> parse_field(Line, Fields)
    end.

%% Starts parsing one field at the head of the text. A leading double
%% quote selects the quoted-field parser (the quote itself is dropped);
%% anything else is parsed as an unquoted field. Both start with an
%% empty character buffer.
parse_field([$\" | Quoted], Fields) ->
    parse_field_q(Quoted, [], Fields);
parse_field(Unquoted, Fields) ->
    parse_field(Unquoted, [], Fields).

%% parse_field(Line, Buf, Fields)
%%
%% Accumulates the characters of an unquoted field in Buf (reversed).
%% The field ends at the end of the line or at a comma; the comma is left
%% in place for parse_line/2 to consume. The finished field is pushed
%% onto Fields and control returns to parse_line/2.
parse_field([], Buf, Fields) ->
    parse_line([], [lists:reverse(Buf) | Fields]);
parse_field([$, | _] = Rest, Buf, Fields) ->
    parse_line(Rest, [lists:reverse(Buf) | Fields]);
parse_field([Ch | Rest], Buf, Fields) ->
    parse_field(Rest, [Ch | Buf], Fields).

%% Convenience entry point: parses a quoted field starting from an empty
%% character buffer.
parse_field_q(Line, Fields) ->
    EmptyBuf = [],
    parse_field_q(Line, EmptyBuf, Fields).
%% parse_field_q(Line, Buf, Fields)
%%
%% Accumulates the body of a quoted field in Buf (reversed); the opening
%% quote has already been consumed.
%%   * a doubled quote ("") is an escaped quote and adds one $" to Buf;
%%   * a single quote closes the field, which is pushed onto Fields
%%     before control returns to parse_line/2;
%%   * the end of the line also closes the field, so an unterminated
%%     quote yields the text read so far instead of a function_clause
%%     crash.
parse_field_q("\"\"" ++ Line, Buf, Fields) -> parse_field_q(Line, [$\" | Buf], Fields);
parse_field_q("\"" ++ Line, Buf, Fields) -> parse_line(Line, [lists:reverse(Buf) | Fields]);
parse_field_q([C | Line], Buf, Fields) -> parse_field_q(Line, [C | Buf], Fields);
parse_field_q([], Buf, Fields) -> parse_line([], [lists:reverse(Buf) | Fields]).

without file:read_line :

%% Reads File completely into memory and parses it without using
%% file:read_line/1. Crashes with a badmatch if the file cannot be read.
%% The contents are passed to parse/2 as a list of bytes together with
%% an empty row accumulator.
parse_file(File) ->
    {ok, Bytes} = file:read_file(File),
    Chars = binary_to_list(Bytes),
    parse(Chars, []).

%% parse(Data, Done) -> Rows
%%
%% Splits the first line off Data, parses it with parse_line/1 and
%% recurses on the remainder. Done holds the rows parsed so far, newest
%% first; the rows are returned in input order once Data is empty.
%%
%% The line-ending pattern lists "\r\n" BEFORE the single-character
%% alternatives. Regex alternation takes the first alternative that
%% matches, so with "\r" first a CRLF would be split at the CR only,
%% leaving the LF to produce a spurious empty row after every line.
parse([], Done) ->
    lists:reverse(Done);
parse(Data, Done) ->
    {Line, Rest} = case re:split(Data, "\r\n|\r|\n", [{return, list}, {parts, 2}]) of
                       [L, R] -> {L, R};
                       %% no line ending left: the remainder is the last line
                       [L] -> {L, []}
                   end,
    parse(Rest, [parse_line(Line) | Done]).
like image 71
Zed Avatar answered Nov 10 '22 15:11

Zed


A side issue:

How are you creating the CSV input? It doesn't appear to be valid CSV (not that there is a particularly rigorous specification for CSV though).

Typically to use double quotes inside a CSV field they need to be escaped as a pair of double quotes, so your example would be:

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------""All Pain Will End.""","","[email protected]"

This will import fine into open office spreadsheet, whereas your original example does not.

like image 2
Rob Charlton Avatar answered Nov 10 '22 14:11

Rob Charlton


I came across your implementation the other day and started playing around with it.

I made you a parser as well.

%% Minimal line-oriented CSV parser.
%%
%% parse_file/1 returns a list of rows; each row is a list of binaries,
%% one per field. Blank lines are skipped. Because the input is split
%% into lines first, a quoted field cannot contain a line break.
-module(csv_parser).

-export([parse_file/1]).

%% Reads File into memory and parses it. Crashes with a badmatch if the
%% file cannot be read.
parse_file(File) ->
    {ok, Data} = file:read_file(File),
    parse(Data).

%% Splits Data into lines and parses every non-empty line.
%% "\r\n" is listed first so that a CRLF is consumed as one line ending.
parse(Data) ->
    Lines = re:split(Data, "\r\n|\r|\n", []),
    [parse_line(Line) || Line <- Lines, Line =/= <<"">>].

%% Parses one line into its fields, in input order.
parse_line(Line) ->
    field_start(Line, []).

%% At the start of a field: an opening quote selects quoted mode.
field_start(<<$\", Rest/binary>>, Fields) -> quoted(Rest, <<>>, Fields);
field_start(Rest, Fields) -> unquoted(Rest, <<>>, Fields).

%% Outside quotes: a comma ends the field, the end of the line ends the row.
unquoted(<<$,, Rest/binary>>, Acc, Fields) -> field_start(Rest, [Acc | Fields]);
unquoted(<<C, Rest/binary>>, Acc, Fields) -> unquoted(Rest, <<Acc/binary, C>>, Fields);
unquoted(<<>>, Acc, Fields) -> lists:reverse([Acc | Fields]).

%% Inside quotes: commas are literal and "" is an escaped quote.
quoted(<<$\", $\", Rest/binary>>, Acc, Fields) -> quoted(Rest, <<Acc/binary, $\">>, Fields);
%% Closing quote: continue in unquoted mode so that the following comma
%% (or any stray text before it) is handled there.
quoted(<<$\", Rest/binary>>, Acc, Fields) -> unquoted(Rest, Acc, Fields);
quoted(<<C, Rest/binary>>, Acc, Fields) -> quoted(Rest, <<Acc/binary, C>>, Fields);
%% Unterminated quote: keep what was read rather than crashing.
quoted(<<>>, Acc, Fields) -> lists:reverse([Acc | Fields]).

I even wrote a small blogpost on this csv parser

like image 2
Martin Kristiansen Avatar answered Nov 10 '22 14:11

Martin Kristiansen