Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to parse complex text files using Python?

I'm looking for a simple way of parsing complex text files into a pandas DataFrame. Below is a sample file, what I want the result to look like after parsing, and my current method.

Is there any way to make it more concise/faster/more pythonic/more readable?

I've also put this question on Code Review.

I eventually wrote a blog article to explain this to beginners.

Here is a sample file:

Sample text  A selection of students from Riverdale High and Hogwarts took part in a quiz. This is a record of their scores.  School = Riverdale High Grade = 1 Student number, Name 0, Phoebe 1, Rachel  Student number, Score 0, 3 1, 7  Grade = 2 Student number, Name 0, Angela 1, Tristan 2, Aurora  Student number, Score 0, 6 1, 3 2, 9  School = Hogwarts Grade = 1 Student number, Name 0, Ginny 1, Luna  Student number, Score 0, 8 1, 7  Grade = 2 Student number, Name 0, Harry 1, Hermione  Student number, Score 0, 5 1, 10  Grade = 3 Student number, Name 0, Fred 1, George  Student number, Score 0, 0 1, 0 

Here is what I want the result to look like after parsing:

                                         Name  Score School         Grade Student number                  Hogwarts       1     0                  Ginny      8                      1                   Luna      7                2     0                  Harry      5                      1               Hermione     10                3     0                   Fred      0                      1                 George      0 Riverdale High 1     0                 Phoebe      3                      1                 Rachel      7                2     0                 Angela      6                      1                Tristan      3                      2                 Aurora      9 

Here is how I currently parse it:

import re import pandas as pd   def parse(filepath):     """     Parse text at given filepath      Parameters     ----------     filepath : str         Filepath for file to be parsed      Returns     -------     data : pd.DataFrame         Parsed data      """      data = []     with open(filepath, 'r') as file:         line = file.readline()         while line:             reg_match = _RegExLib(line)              if reg_match.school:                 school = reg_match.school.group(1)              if reg_match.grade:                 grade = reg_match.grade.group(1)                 grade = int(grade)              if reg_match.name_score:                 value_type = reg_match.name_score.group(1)                 line = file.readline()                 while line.strip():                     number, value = line.strip().split(',')                     value = value.strip()                     dict_of_data = {                         'School': school,                         'Grade': grade,                         'Student number': number,                         value_type: value                     }                     data.append(dict_of_data)                     line = file.readline()              line = file.readline()          data = pd.DataFrame(data)         data.set_index(['School', 'Grade', 'Student number'], inplace=True)         # consolidate df to remove nans         data = data.groupby(level=data.index.names).first()         # upgrade Score from float to integer         data = data.apply(pd.to_numeric, errors='ignore')     return data   class _RegExLib:     """Set up regular expressions"""     # use https://regexper.com to visualise these if required     _reg_school = re.compile('School = (.*)\n')     _reg_grade = re.compile('Grade = (.*)\n')     _reg_name_score = re.compile('(Name|Score)')      def __init__(self, line):         # check whether line has a positive match with all of the regular expressions         self.school = self._reg_school.match(line)         self.grade = self._reg_grade.match(line)         self.name_score = self._reg_name_score.search(line)   if __name__ == '__main__':     filepath = 'sample.txt'     data = parse(filepath)     print(data) 
like image 524
bluprince13 Avatar asked Dec 26 '17 19:12

bluprince13


People also ask

Does Python have a parser?

In Python, there is a built-in module called parse which provides an interface between the Python internal parser and compiler, where this module allows the python program to edit the small fragments of code and create the executable program from this edited parse tree of python code.


1 Answers

Update 2019 (PEG parser):

This answer has received quite some attention so I felt to add another possibility, namely a parsing option. Here we could use a PEG parser instead (e.g. parsimonious) in combination with a NodeVisitor class:

from parsimonious.grammar import Grammar from parsimonious.nodes import NodeVisitor import pandas as pd grammar = Grammar(     r"""     schools         = (school_block / ws)+      school_block    = school_header ws grade_block+      grade_block     = grade_header ws name_header ws (number_name)+ ws score_header ws (number_score)+ ws?       school_header   = ~"^School = (.*)"m     grade_header    = ~"^Grade = (\d+)"m     name_header     = "Student number, Name"     score_header    = "Student number, Score"      number_name     = index comma name ws     number_score    = index comma score ws      comma           = ws? "," ws?      index           = number+     score           = number+      number          = ~"\d+"     name            = ~"[A-Z]\w+"     ws              = ~"\s*"     """ )  tree = grammar.parse(data)  class SchoolVisitor(NodeVisitor):     output, names = ([], [])     current_school, current_grade = None, None      def _getName(self, idx):         for index, name in self.names:             if index == idx:                 return name      def generic_visit(self, node, visited_children):         return node.text or visited_children      def visit_school_header(self, node, children):         self.current_school = node.match.group(1)      def visit_grade_header(self, node, children):         self.current_grade = node.match.group(1)         self.names = []      def visit_number_name(self, node, children):         index, name = None, None         for child in node.children:             if child.expr.name == 'name':                 name = child.text             elif child.expr.name == 'index':                 index = child.text          self.names.append((index, name))      def visit_number_score(self, node, children):         index, score = None, None         for child in node.children:             if child.expr.name == 'index':                 index = child.text             elif child.expr.name == 'score':                 score = child.text          name = self._getName(index)          # build the entire entry         entry = (self.current_school, self.current_grade, index, name, score)         self.output.append(entry)  sv = SchoolVisitor() sv.visit(tree)  df = pd.DataFrame.from_records(sv.output, columns = ['School', 'Grade', 'Student number', 'Name', 'Score']) print(df) 

Regex option (original answer)

Well then, watching Lord of the Rings the xth time, I had to bridge some time to the very finale:


Broken down, the idea is to split the problem up into several smaller problems:
  1. Separate each school
  2. ... each grade
  3. ... student and scores
  4. ... bind them together in a dataframe afterwards


The school part (see a demo on regex101.com)
^ School\s*=\s*(?P<school_name>.+) (?P<school_content>[\s\S]+?) (?=^School|\Z) 


The grade part (another demo on regex101.com)
^ Grade\s*=\s*(?P<grade>.+) (?P<students>[\s\S]+?) (?=^Grade|\Z) 


The student/score part (last demo on regex101.com):
^ Student\ number,\ Name[\n\r] (?P<student_names>(?:^\d+.+[\n\r])+) \s* ^ Student\ number,\ Score[\n\r] (?P<student_scores>(?:^\d+.+[\n\r])+) 

The rest is a generator expression which is then fed into the DataFrame constructor (along with the column names).


The code:
import pandas as pd, re  rx_school = re.compile(r'''     ^     School\s*=\s*(?P<school_name>.+)     (?P<school_content>[\s\S]+?)     (?=^School|\Z) ''', re.MULTILINE | re.VERBOSE)  rx_grade = re.compile(r'''     ^     Grade\s*=\s*(?P<grade>.+)     (?P<students>[\s\S]+?)     (?=^Grade|\Z) ''', re.MULTILINE | re.VERBOSE)  rx_student_score = re.compile(r'''     ^     Student\ number,\ Name[\n\r]     (?P<student_names>(?:^\d+.+[\n\r])+)     \s*     ^     Student\ number,\ Score[\n\r]     (?P<student_scores>(?:^\d+.+[\n\r])+) ''', re.MULTILINE | re.VERBOSE)   result = ((school.group('school_name'), grade.group('grade'), student_number, name, score)     for school in rx_school.finditer(string)     for grade in rx_grade.finditer(school.group('school_content'))     for student_score in rx_student_score.finditer(grade.group('students'))     for student in zip(student_score.group('student_names')[:-1].split("\n"), student_score.group('student_scores')[:-1].split("\n"))     for student_number in [student[0].split(", ")[0]]     for name in [student[0].split(", ")[1]]     for score in [student[1].split(", ")[1]] )  df = pd.DataFrame(result, columns = ['School', 'Grade', 'Student number', 'Name', 'Score']) print(df) 


Condensed:
rx_school = re.compile(r'^School\s*=\s*(?P<school_name>.+)(?P<school_content>[\s\S]+?)(?=^School|\Z)', re.MULTILINE) rx_grade = re.compile(r'^Grade\s*=\s*(?P<grade>.+)(?P<students>[\s\S]+?)(?=^Grade|\Z)', re.MULTILINE) rx_student_score = re.compile(r'^Student number, Name[\n\r](?P<student_names>(?:^\d+.+[\n\r])+)\s*^Student number, Score[\n\r](?P<student_scores>(?:^\d+.+[\n\r])+)', re.MULTILINE) 


This yields
            School Grade Student number      Name Score 0   Riverdale High     1              0    Phoebe     3 1   Riverdale High     1              1    Rachel     7 2   Riverdale High     2              0    Angela     6 3   Riverdale High     2              1   Tristan     3 4   Riverdale High     2              2    Aurora     9 5         Hogwarts     1              0     Ginny     8 6         Hogwarts     1              1      Luna     7 7         Hogwarts     2              0     Harry     5 8         Hogwarts     2              1  Hermione    10 9         Hogwarts     3              0      Fred     0 10        Hogwarts     3              1    George     0 


As for timing, this is the result running it a ten thousand times:
import timeit print(timeit.timeit(makedf, number=10**4)) # 11.918397722000009 s 

like image 149
Jan Avatar answered Sep 30 '22 12:09

Jan