I have a text file (there is no punctuation) of about 100 MB - 1 GB. Here are some example lines:
please check in here
i have a full hd movie
see you again bye bye
press ctrl c to copy text to clipboard
i need your help
...
I also have a list of replace tokens, like this:
check in -> check_in
full hd -> full_hd
bye bye -> bye_bye
ctrl c -> ctrl_c
...
And this is the output I want after running the replacements on the text file:
please check_in here
i have a full_hd movie
see you again bye_bye
press ctrl_c to copy text to clipboard
i need your help
...
import re

replace_tokens = {'ctrl c': 'ctrl_c', ...}  # a Python dictionary

for line in open('text_file'):
    for token in replace_tokens:
        line = re.sub(r'\b{}\b'.format(token), replace_tokens[token], line)
    # Save line to file
This solution works, but it is very slow for a large number of replace tokens and a large text file. Is there a better solution?
Use binary files and string replace, as follows. One caveat: unlike the \b-anchored regex in the question, plain string replace also matches inside longer words (e.g. "check in" within "check inn"), so this trades a little precision for speed.
Code
def process_binary(filename):
    """ Replace strings using binary files and string replace.

    Processing follows the original code flow, except using
    binary files and string replace. """
    # Map using binary strings
    replace_tokens = {b'ctrl c': b'ctrl_c', b'full hd': b'full_hd',
                      b'bye bye': b'bye_bye', b'check in': b'check_in'}
    outfile = append_id(filename, 'processed')
    with open(filename, 'rb') as fi, open(outfile, 'wb') as fo:
        for line in fi:
            for token in replace_tokens:
                line = line.replace(token, replace_tokens[token])
            fo.write(line)

def append_id(filename, id):
    """ Convenience handler for generating the name of the output file. """
    return "{0}_{2}.{1}".format(*filename.rsplit('.', 1) + [id])
Performance Comparison
On a 124 MB file (generated by replicating the posted string):

Posted solution (per-token regex):      ~82.8 s
Current solution (binary str.replace):   ~9.6 s
Combined regex (dawg's answer):         ~28.3 s

General Trend: (figure omitted; the timings above come from the test runs below)
Test Code
# Generate data by replicating the posted string
s = """please check in here
i have a full hd movie
see you again bye bye
press ctrl c to copy text to clipboard
i need your help
"""
with open('test_data.txt', 'w') as fo:
    for i in range(1000000):  # Repeat string 1M times
        fo.write(s)
# Time posted solution
from time import time
import re

def posted(filename):
    replace_tokens = {'ctrl c': 'ctrl_c', 'full hd': 'full_hd',
                      'bye bye': 'bye_bye', 'check in': 'check_in'}
    outfile = append_id(filename, 'posted')
    with open(filename, 'r') as fi, open(outfile, 'w') as fo:
        for line in fi:
            for token in replace_tokens:
                line = re.sub(r'\b{}\b'.format(token), replace_tokens[token], line)
            fo.write(line)

def append_id(filename, id):
    return "{0}_{2}.{1}".format(*filename.rsplit('.', 1) + [id])

t0 = time()
posted('test_data.txt')
print('Elapsed time: ', time() - t0)
# Elapsed time: 82.84100198745728
# Time current solution
from time import time

def process_binary(filename):
    replace_tokens = {b'ctrl c': b'ctrl_c', b'full hd': b'full_hd',
                      b'bye bye': b'bye_bye', b'check in': b'check_in'}
    outfile = append_id(filename, 'processed')
    with open(filename, 'rb') as fi, open(outfile, 'wb') as fo:
        for line in fi:
            for token in replace_tokens:
                line = line.replace(token, replace_tokens[token])
            fo.write(line)

def append_id(filename, id):
    return "{0}_{2}.{1}".format(*filename.rsplit('.', 1) + [id])

t0 = time()
process_binary('test_data.txt')
print('Elapsed time: ', time() - t0)
# Elapsed time: 9.593998670578003
# Time processing using a single combined regex
# (avoids the inner loop -- see dawg's posted answer)
import re

def process_regex(filename):
    tokens = {"check in": "check_in", "full hd": "full_hd",
              "bye bye": "bye_bye", "ctrl c": "ctrl_c"}
    regex = re.compile("|".join([r"\b{}\b".format(t) for t in tokens]))
    outfile = append_id(filename, 'regex')
    with open(filename, 'r') as fi, open(outfile, 'w') as fo:
        for line in fi:
            line = regex.sub(lambda m: tokens[m.group(0)], line)
            fo.write(line)

def append_id(filename, id):
    return "{0}_{2}.{1}".format(*filename.rsplit('.', 1) + [id])

t0 = time()
process_regex('test_data.txt')
print('Elapsed time: ', time() - t0)
# Elapsed time: 28.27900242805481
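If you need to keep the \b word-boundary behaviour while still reading the file in binary mode, the two approaches above can be combined by compiling the alternation as a bytes pattern. The sketch below is mine and unbenchmarked; process_binary_regex and the 'binary_regex' suffix are made-up names, and it reuses append_id from above. The tokens here contain no regex metacharacters; re.escape would be needed otherwise.

# A minimal, unbenchmarked sketch: combined regex over a binary file
# (process_binary_regex and 'binary_regex' are illustrative names)
import re

def process_binary_regex(filename):
    # Tokens as bytes to match the binary read mode
    tokens = {b'check in': b'check_in', b'full hd': b'full_hd',
              b'bye bye': b'bye_bye', b'ctrl c': b'ctrl_c'}
    # One combined bytes pattern; \b acts as an ASCII word boundary
    regex = re.compile(b"|".join(br"\b" + t + br"\b" for t in tokens))
    outfile = append_id(filename, 'binary_regex')
    with open(filename, 'rb') as fi, open(outfile, 'wb') as fo:
        for line in fi:
            # Single pass per line: each match is looked up in the dict
            fo.write(regex.sub(lambda m: tokens[m.group(0)], line))

Whether this beats plain bytes replace will depend on the number of tokens, so it is worth timing on your own data.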