Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Split dataframes according to a vector of positions

Tags:

dataframe

r

I want to split a dataframe into a list of 6 different-sized dataframes. I specify the positions at which to split with a vector, posns, shown below. I've tried using split, but instead of the desired output I get 6 equally-sized dataframes.

How can I do this?

# Row positions at which df1 is to be split; five break points give six pieces.
posns <- c(4, 50, 68, 81, 90)

# Example input: a 100-row data.frame literal (looks like dput() output) with
# five columns: chrom, snp_pos, Q.x, Q.y and Q. The three Q* columns contain
# many NA entries. row.names = c(NA, 100L) is the compact form for rows 1..100.
df1 = structure(list(chrom = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), snp_pos = c(948921L, 
949608L, 949654L, 1227249L, 1254841L, 1262966L, 1263144L, 1263362L, 
1288583L, 1455652L, 1571066L, 1571464L, 1571470L, 1571802L, 1599812L, 
1599888L, 1630271L, 1647814L, 1647814L, 1647871L, 1647871L, 1650787L, 
1650787L, 1650797L, 1650797L, 1650801L, 1650801L, 1650807L, 1650807L, 
1650845L, 1650845L, 1670432L, 1670432L, 1670432L, 1671087L, 1671087L, 
1671087L, 1683565L, 1683565L, 1683565L, 1684169L, 1684169L, 1684169L, 
1684472L, 1684472L, 1684472L, 1686040L, 1686040L, 1686040L, 1718435L, 
1718435L, 2125172L, 2441358L, 2488153L, 2490942L, 2494330L, 2494785L, 
3545250L, 3551792L, 6694574L, 6694927L, 6695331L, 7841330L, 8022824L, 
8412935L, 8412989L, 8413839L, 8425900L, 9811541L, 10218439L, 
10240094L, 10473196L, 10473200L, 10479791L, 10708142L, 11082919L, 
11114822L, 11114940L, 11132217L, 11736131L, 11810354L, 11847759L, 
11983206L, 11985396L, 12009956L, 12012753L, 12024235L, 12025648L, 
12071680L, 16890415L, 16890421L, 16890428L, 16890441L, 16890558L, 
16890559L, 16891333L, 16891340L, 16891365L, 16893721L, 16893736L
), Q.x = c(0.741961301980865, 1, 0.720109026807207, 0.000379926095791477, 
1, 0.569157762597131, 0.0448134555282655, 0.263705838768648, 
1, 3.9401608189424e-08, NA, NA, NA, NA, 0.141036658207429, 4.84068069656854e-08, 
4.43661413003932e-11, 0.916059828440023, 0.916059828440023, 0.659922962581594, 
0.659922962581594, 0.413553370535633, 0.413553370535633, 0.714246817533455, 
0.714246817533455, 0.721981775878533, 0.721981775878533, 1, 1, 
0.0014954358811119, 0.0014954358811119, 8.83093446255536e-14, 
8.83093446255536e-14, 8.83093446255536e-14, 0.281581364975761, 
0.281581364975761, 0.281581364975761, 1, 1, 1, 1, 1, 1, 0.0415833199080577, 
0.0415833199080577, 0.0415833199080577, 0.0446393461337085, 0.0446393461337085, 
0.0446393461337085, NA, NA, 0.0955715926532034, 0.538378452872325, 
0.0534014601577661, 0.335721613890647, 0.10791993889237, 0.856046745017246, 
0.0630351159601902, 0.00172714428632725, 0.440712852235607, 0.00599466402196809, 
0.0572560467887719, NA, NA, 4.15876549078e-05, NA, 0.0198308292795067, 
0.201292584136377, NA, 1, 0.227189739568257, 0.00172103054903301, 
0.0031569678468897, 0.112209415561467, 0.214802908052941, 5.08875303388692e-05, 
NA, NA, NA, NA, NA, 0.0165387785489721, 0.0124037431571059, 7.3978214204246e-34, 
0.326191223745559, NA, 0.0701742102840443, NA, 0.351069598560997, 
2.3479965234952e-12, 3.46177972593879e-06, 1, 0.0974541314547016, 
1, 7.50982175368481e-08, 0.000151416356355741, 1, 1.76165018835578e-17, 
3.10961711424869e-22, 8.29863562348751e-23), Q.y = c(NA, NA, 
NA, NA, NA, NA, NA, NA, 1, NA, 0.192489461231087, 0.00296682751485515, 
0.000175053346844423, 0.0013758526261836, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.160250237971167, 
0.817597720785312, NA, NA, NA, 0.926435352180301, NA, NA, 0.95226758057333, 
NA, NA, NA, NA, 0.646154538622465, 0.747932105441424, 0.539645992048171, 
1, 1, NA, NA, NA, NA, 1, 0.387507157909907, 0.827583128653863, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Q = c(NA, 1, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 
1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.988419768874236, 
NA, 0.05888784043377, 0.65213668882967, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("chrom", 
"snp_pos", "Q.x", "Q.y", "Q"), row.names = c(NA, 100L), class = "data.frame")
like image 968
Kaleb Avatar asked Dec 15 '22 11:12

Kaleb


1 Answer

A possible vectorized approach uses the findInterval function:

# Label every row with the index of the interval it falls into, then split on
# those labels. Using `posns + 1` as the break points makes each value in
# `posns` the last row of its piece; rows before the first break get label 0.
# seq_len() replaces 1:nrow(df1), which would yield c(1, 0) for a zero-row
# data frame.
res <- split(df1, findInterval(seq_len(nrow(df1)), posns + 1))

Validating results

# Show the (rows, columns) dimensions of each piece of the split list.
lapply(X = res, FUN = dim)
# $`0`
# [1] 4 5
# 
# $`1`
# [1] 46  5
# 
# $`2`
# [1] 18  5
# 
# $`3`
# [1] 13  5
# 
# $`4`
# [1] 9 5
# 
# $`5`
# [1] 10  5
like image 53
David Arenburg Avatar answered Jan 06 '23 17:01

David Arenburg