Insert a newline character every 10 characters in a string using Julia

Tags:

I want to insert a newline character after every 10 characters of a protein sequence:

seq="MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"

In Perl, it is very easy :

$seq=~s/(.{10})/$1\n/g ; # does the job!

perl -e '$seq="MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"; $seq=~s/(.{10})/$1\n/g; print $seq'
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ

In Julia,

replace(seq, r"(.{10})" , "\n")

does not work, because I don't know how to refer to the capture group (.{10}) in the replacement so that each match is substituted with itself + "\n":

julia> replace(seq, r"(.{10})" , "\n")
"\n\n\n\n\n\n"

So, to do that, I need two steps:

    julia> a=matchall(r"(.{1,10})" ,seq)
    6-element Array{SubString{UTF8String},1}:
     "MSKNKSPLLN"
     "ESEKMMSEML"
     "PMKVSQSKLN"
     "YEEKVYIPTT"
     "IRNRKQHCFR"
     "RFFPYIALFQ"

    julia> b=join(a, "\n")
    "MSKNKSPLLN\nESEKMMSEML\nPMKVSQSKLN\nYEEKVYIPTT\nIRNRKQHCFR\nRFFPYIALFQ"

    julia> println(b)
    MSKNKSPLLN
    ESEKMMSEML
    PMKVSQSKLN
    YEEKVYIPTT
    IRNRKQHCFR
    RFFPYIALFQ

# Caution :    
a=matchall(r"(.{10})" ,seq) # wrong if seq is not exactly a multiple of 10 !

julia> seq
"MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIAL"

julia> matchall(r"(.{10})" ,seq)
5-element Array{SubString{UTF8String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"

julia> matchall(r"(.{1,10})" ,seq)
6-element Array{SubString{UTF8String},1}:
"MSKNKSPLLN"
"ESEKMMSEML"
"PMKVSQSKLN"
"YEEKVYIPTT"
"IRNRKQHCFR"
"RFFPYIAL"

Is there a one step solution or a better (faster) way?

Just for fun, here is a benchmark of all these interesting answers! (Updated for Julia 0.5.)

"""
    loop(a)

Print `a` with a line break inserted after every 10 characters.

The string is rebuilt by slicing at each break position. No break is added after the
final chunk, so the output ends with just the newline written by `println`, whether or
not the length of `a` is a multiple of 10.

Slicing uses integer positions, so `a` is assumed to hold one code unit per character
(for example plain ASCII sequence letters).
"""
function loop(a)
    # line width, in your case 10
    salt = 10
    # position of the last character of the first line (Julia indices start at 1)
    i = salt
    # Re-read the length on every pass: each insertion makes `a` one character longer.
    # The strict `<` means nothing is inserted when the text ends exactly at `i`.
    while i < length(a)
        a = string(a[1:i], '\n', a[i+1:end])
        # step over the next `salt` characters plus the newline just inserted
        i += salt + 1
    end
    println(a)
end

"""
    regex1(seq)

Print `seq` split into lines of at most 10 characters, using a regex to collect the
chunks and `join` to glue them back together with newlines.
"""
function regex1(seq)
    # `{1,10}` rather than `{10}`: a final chunk shorter than 10 characters is kept.
    # `eachmatch` replaces `matchall`, which no longer exists in Julia 1.0 and later.
    chunks = (m.match for m in eachmatch(r"(.{1,10})", seq))
    println(join(chunks, "\n"))
end

"""
    regex2(seq)

Print `seq` split into lines of at most 10 characters by first separating every full
group of 10 with a space and then splitting on whitespace.

Because the split is on whitespace, any whitespace already present in `seq` also
becomes a line break.
"""
function regex2(seq)
    # Pair form of `replace`; the three-argument form was removed in Julia 1.0.
    spaced = replace(seq, r"(.{10})" => s"\1 ")
    # `split` with no delimiter drops the empty piece left by a trailing space, so an
    # input whose length is a multiple of 10 does not gain an extra blank line.
    println(join(split(spaced), "\n"))
end

"""
    regex3(seq)

Print `seq` with a line break after every full group of 10 characters, done with a
single regex substitution that re-emits each captured group followed by a newline.
"""
function regex3(seq)
    # The substitution string is built by hand because an s"..." literal keeps
    # backslashes raw and so cannot contain a real newline; hence the escaped `\\1`.
    # Pair form of `replace`; the three-argument form was removed in Julia 1.0.
    wrapped = replace(seq, r"(.{10})" => Base.SubstitutionString("\\1\n"))
    # When the length is a multiple of 10 the substitution leaves a newline at the very
    # end; drop it so `println` does not produce an empty last line.
    println(chomp(wrapped))
end

"""
    intrapad(seq::String)

Print `seq` in lines of at most 10 characters by writing each chunk, followed by a
newline, into an in-memory buffer and printing the buffer once at the end.

Every chunk, including the last, is followed by a newline, so the text is emitted with
`print` rather than `println`. An empty `seq` prints nothing.

Chunks are addressed by integer position, so `seq` is assumed to hold one code unit per
character (for example plain ASCII sequence letters).
"""
function intrapad(seq::String)
    n = length(seq)
    # Room for the text plus one newline per chunk. This is only a capacity hint, so
    # the buffer can still grow if needed.
    buf = IOBuffer(sizehint = n + cld(n, 10))
    for i in 1:10:n
        # Clamp the end of the last chunk: `SubString` raises a BoundsError for an end
        # position past the end of the string.
        write(buf, SubString(seq, i, min(i + 9, n)), '\n')
    end
    print(String(take!(buf)))
end

"""
    join_substring(seq)

Print `seq` split into lines of at most 10 characters by joining consecutive
substrings with newlines.

Chunks are addressed by integer position, so `seq` is assumed to hold one code unit per
character (for example plain ASCII sequence letters).
"""
function join_substring(seq)
    n = length(seq)
    # Clamp the end of the last chunk: `SubString` raises a BoundsError for an end
    # position past the end of the string, which happens whenever the length is not a
    # multiple of 10.
    chunks = (SubString(seq, i, min(i + 9, n)) for i in 1:10:n)
    println(join(chunks, '\n'))
end

# Benchmark input: a 60-character sequence, i.e. exactly six full lines of 10.
seq="MSKNKSPLLNESEKMMSEMLPMKVSQSKLNYEEKVYIPTTIRNRKQHCFRRFFPYIALFQ"

# Run every candidate five times in a row. Each call is preceded by a label line and
# followed by the `@time` report for that call. The first round of each function also
# includes compilation, so the later rounds are the ones worth comparing.
# Every candidate prints its result, so the timings include the cost of that output.
for i = 1:5
  println("loop :")
  @time loop(seq)
  println("regex1 :")
  @time regex1(seq)
  println("regex2 :")
  @time regex2(seq)
  println("regex3 :")
  @time regex3(seq)
  println("intrapad :")
  @time intrapad(seq)
  println("join substring :")
  @time join_substring(seq)
end

I changed the benchmark to execute @time 5 times, and I post here the results after 5 executions of @time:

loop :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIA
LFQ
  0.000013 seconds (53 allocations: 3.359 KB)
regex1 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
  0.000013 seconds (49 allocations: 1.344 KB)
regex2 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
  0.000017 seconds (47 allocations: 1.703 KB)
regex3 :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
  0.000013 seconds (31 allocations: 976 bytes)
intrapad :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
  0.000007 seconds (9 allocations: 608 bytes)
join substring :
MSKNKSPLLN
ESEKMMSEML
PMKVSQSKLN
YEEKVYIPTT
IRNRKQHCFR
RFFPYIALFQ
  0.000012 seconds (21 allocations: 800 bytes)

Intrapad is now first ;)

246

asked Nov 11 '16 10:11

2 Answers

I step by the required line-length through a range to format DNA sequences. In the following snippet, the step is 60.

# Wrap `seq` at 60 characters per line. Every line, including the last (possibly
# shorter) one, ends with "\n"; an empty `seq` gives an empty string.
# The end of each slice is clamped to the length of `seq`, so every chunk is kept and
# the final one simply comes out shorter.
# Slicing uses integer positions, so `seq` is assumed to hold one code unit per
# character (for example plain ASCII bases).
new_seq = join(string(seq[x:min(x + 59, length(seq))], "\n") for x in 1:60:length(seq))

If seq is

ATTCGACTCTTATGCCTATCGCTAGCTAGCATCTATTCGACTCTTATGCCTATCGCTAGCTAGCATCAATTCGACTCTTATGCCTATCGCTAGCTAGCATCGATTCGACTCTTATGCCTATCGCTAGCTAGCATCCATTCGACTCTTATGCCTATCGCTAGCTAGCATCTATTCGACTCTTATGCCTATCGCTAGCTAGCATCAATTCGACTCTTATGCCTATCGCTAGCTAGCATCGATTCGACTCTTA

then, the printed output becomes

println(new_seq)
ATTCGACTCTTATGCCTATCGCTAGCTAGCATCTATTCGACTCTTATGCCTATCGCTAGC
TAGCATCAATTCGACTCTTATGCCTATCGCTAGCTAGCATCGATTCGACTCTTATGCCTA
TCGACTCTTA

answered Sep 23 '22 14:09

Like @daycaster suggests, you can use s"\1" as a replacement string to support capture groups. The trouble is that the special s"" string syntax doesn't support special characters like \n. You can get around this by manually constructing a SubstitutionString object, but then you need to escape the \ in \1:

julia> replace(seq, r"(.{10})", Base.SubstitutionString("\\1\n"))
"MSKNKSPLLN\nESEKMMSEML\nPMKVSQSKLN\nYEEKVYIPTT\nIRNRKQHCFR\nRFFPYIALFQ\n"

answered Sep 24 '22 14:09

mbauman

Related questions
                            
                                Python regex search for string at beginning of line in file
                            
                                Bug in Pattern.asPredicate?
                            
                                Remove text after the second space
                            
                                XML schema restriction pattern for not allowing empty strings
                            
                                Has anyone found that REGEX "\b" doesn't work in MYSQL?
                            
                                "preg_match(): Compilation failed: unmatched parentheses" in PHP for valid pattern
                            
                                Extract email and name with regex
                            
                                Python regex matching all but last occurrence
                            
                                Replace x with y or append y if no x
                            
                                Bash need to test for alphanumeric string
                            
                                How to order regular expression alternatives to get longest match?
                            
                                Javascript global match with capturing groups [duplicate]
                            
                                Javascript/Regex for finding just the root domain name without sub domains
                            
                                How to remove the second line of consecutive lines starting with the same word?
                            
                                Regular expressions: remove non alpha numerics, with an exception
                            
                                How to extract a substring pattern in Postgresql
                            
                                Find all matches in a string using regex [duplicate]
                            
                                Google Test - Test that a string does not contain a string
                            
                                Whats the difference between [\s\S]*? and .*? in Java regular expressions?
                            
                                How to get the parameter from a relative URL string in C#?

Donate For Us

If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

Insert a newline character every 10 characters in a string using Julia

Tags:

regex

replace

julia

Fred

People also ask

2 Answers

Eric Kofoid

mbauman

Recent Activity

Donate For Us