I have a tab-delimited file containing genomic regions and the biological entities found in those regions (I checked for 67 entities, so each region was tested for the presence or absence of each of these 67 entities and their frequency).
I have all this data in a tabular format.
A sample data is given below
Region ATF3 BCL3 BCLAF1 BDP1 BRF1 BRF2 Brg1 CCNT2 CEBPB CHD2 CTCF CTCFL E2F6 ELF1
chr1:109102470:109102970 0 0 1 0 0 0 0 1 0 0 4 1 4 1
chr1:110526886:110527386 0 0 0 0 0 0 0 1 1 0 4 1 0 1
chr1:115300671:115301171 0 0 1 0 0 0 0 0 1 1 4 1 1 1
chr1:115323308:115323808 0 0 0 0 0 0 0 1 0 0 2 1 1 0
chr1:11795641:11796141 1 0 0 0 0 0 0 1 2 0 0 0 1 0
chr1:118148103:118148603 0 0 0 0 0 0 0 1 0 0 0 0 0 1
chr1:150521397:150521897 0 0 0 0 0 0 0 2 2 0 6 2 4 0
chr1:150601609:150602109 0 0 0 0 0 0 0 0 3 2 0 0 1 0
chr1:150602098:150602598 0 0 0 0 0 0 0 0 1 1 0 0 0 0
chr1:151119140:151119640 0 0 0 0 0 0 0 1 0 0 0 0 1 0
chr1:151128604:151129104 0 0 0 0 0 0 0 0 0 0 3 0 0 0
chr1:153517729:153518229 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1:153962738:153963238 0 0 0 0 0 0 0 1 1 0 0 0 0 1
chr1:154155682:154156182 0 0 0 0 0 0 0 1 0 0 0 0 1 1
chr1:154155725:154156225 0 0 0 0 0 0 0 1 0 0 0 0 1 1
chr1:154192154:154192654 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1:154192824:154193324 1 0 0 0 0 0 0 1 0 1 0 0 1 1
chr1:154192943:154193443 1 0 0 0 0 0 0 1 0 2 0 0 1 1
chr1:154193273:154193773 1 0 0 0 0 0 0 1 0 2 0 0 2 1
chr1:154193313:154193813 0 0 0 0 0 0 0 1 0 2 0 0 2 1
chr1:155904188:155904688 0 0 0 0 0 0 0 1 0 0 0 0 1 1
chr1:155947966:155948466 0 0 0 0 0 0 0 1 0 0 3 0 0 1
chr1:155948336:155948836 0 0 0 0 0 0 0 1 0 0 5 1 0 1
chr1:156023516:156024016 0 0 0 0 0 0 0 1 0 1 4 1 1 1
chr1:156024016:156024516 0 1 1 0 0 0 0 0 0 2 0 0 1 1
chr1:156163229:156163729 0 0 0 0 0 0 0 0 0 0 2 0 0 1
chr1:160990902:160991402 0 0 0 0 0 0 0 0 0 1 0 0 1 2
chr1:160991133:160991633 0 0 0 0 0 0 0 0 0 1 0 0 1 2
chr1:161474704:161475204 0 0 0 0 0 0 0 0 0 0 0 0 0 0
chr1:161509530:161510030 0 0 1 1 1 0 0 0 1 0 1 0 0 1
chr1:161590964:161591464 0 0 0 1 1 0 0 0 0 0 0 0 0 0
chr1:169075446:169075946 0 0 0 0 0 0 0 2 0 0 4 0 3 0
chr1:17053279:17053779 0 0 0 1 0 0 0 0 0 1 0 0 0 0
chr1:1709909:1710409 0 0 0 0 0 0 0 2 0 1 0 0 3 1
chr1:1710297:1710797 0 0 0 0 0 0 0 0 0 1 6 0 1 1
Now how can I put this in a heat map, colour-coded from light red to dark red depending on the frequency, and white in case of absence?
Is there any other better way to represent this type of data?
Following the comments on my other answer, the OP asked a follow-up question about finding 2D clusters. Here is an answer.
Taken from my library eegpy, I use a method find_clusters. It performs a walk across a 2d-array, finding all clusters above / below a given threshold.
Here is my code:
import pylab as plt
import numpy as np
from Queue import Queue
def find_clusters(ar, thres, cmp_type="greater"):
    """Find connected clusters in a 2d-array relative to a threshold.

    Builds a boolean mask of cells passing the comparison, then groups
    4-connected (direct-neighbour) True cells via breadth-first search.

    Parameters
    ----------
    ar : 2d numpy array
        The test statistic.
    thres : scalar
        Threshold to compare each cell against.
    cmp_type : str
        One of "lower" (ar < thres), "greater" (ar > thres) or
        "abs_greater" (abs(ar) > thres).

    Returns
    -------
    list of 2d boolean arrays, one per cluster; True marks cluster
    membership. Clusters are emitted in row-major order of their seed cell.

    Raises
    ------
    ValueError
        If cmp_type is not one of the three supported modes.
    """
    # Local import: avoids the Python-2-only "from Queue import Queue"
    # module dependency the original relied on.
    from collections import deque

    if cmp_type not in ("lower", "greater", "abs_greater"):
        raise ValueError("cmp_type must be in [\"lower\",\"greater\",\"abs_greater\"]")
    if cmp_type == "lower":
        ar_in = ar < thres
    elif cmp_type == "greater":
        ar_in = ar > thres
    else:  # cmp_type == "abs_greater"
        ar_in = abs(ar) > thres
    # builtin bool: np.bool was deprecated in NumPy 1.20 and later removed
    ar_in = ar_in.astype(bool)

    clusters = []
    already_visited = np.zeros(ar_in.shape, bool)
    n_rows, n_cols = ar_in.shape
    for i_s in range(n_rows):
        for i_f in range(n_cols):
            if already_visited[i_s, i_f] or not ar_in[i_s, i_f]:
                continue
            # New cluster seed: BFS over direct neighbours.
            mask = np.zeros(ar_in.shape, bool)
            pending = deque([(i_s, i_f)])
            while pending:
                pos_x, pos_y = pending.popleft()
                if already_visited[pos_x, pos_y]:
                    continue
                already_visited[pos_x, pos_y] = True
                if ar_in[pos_x, pos_y]:
                    mask[pos_x, pos_y] = True
                    for nx, ny in ((pos_x - 1, pos_y), (pos_x + 1, pos_y),
                                   (pos_x, pos_y - 1), (pos_x, pos_y + 1)):
                        if 0 <= nx < n_rows and 0 <= ny < n_cols:
                            pending.append((nx, ny))
            clusters.append(mask)
    return clusters
fn = "14318737.txt"
with open(fn, "r") as f:
    # First header token is "Region"; the rest are the entity names.
    labels = f.readline().rstrip("\n").split()[1:]
# Column 0 holds the region string, which loadtxt cannot parse as a
# number, so convert it to a dummy 0.
data = np.loadtxt(fn, skiprows=1, converters={0: lambda x: 0})
clusters = find_clusters(data, 0, "greater")
# Mask zeros so absent entities render as white rather than light red.
plot_data = np.ma.masked_equal(data[:, 1:], 0)
plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)
# cmap passed by name: plt.cm.get_cmap is deprecated in recent matplotlib
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect="auto",
           vmin=0, extent=[0.5, plot_data.shape[1] + 0.5, plot_data.shape[0] - 0.5, -0.5])
plt.colorbar()
for cl in clusters:
    # astype(int): np.int was removed from NumPy.
    # linewidths (not lw) is the contour keyword for line width.
    plt.contour(cl.astype(int), [0.5], colors="k", linewidths=2)
# One tick per label (the original off-by-one arange(1, len+2) produced
# len(labels)+1 positions, which modern matplotlib rejects).
plt.xticks(np.arange(1, len(labels) + 1), labels, rotation=90, va="top", ha="center")
plt.show()
which gives an image of the form:

clusters is a list of boolean 2d-arrays (True / False). Each array represents one cluster, where each boolean value indicates whether a specific "point" is part of this cluster. You can use it in any further analysis.
EDIT
Now with some more fun on the clusters
import pylab as plt
import numpy as np
from Queue import Queue
def find_clusters(ar, thres, cmp_type="greater"):
    """Find connected clusters in a 2d-array relative to a threshold.

    Thresholds the array into a boolean mask, then flood-fills
    4-connected (direct-neighbour) True cells into clusters.

    Parameters
    ----------
    ar : 2d numpy array
        The test statistic.
    thres : scalar
        Comparison threshold.
    cmp_type : str
        "lower", "greater" or "abs_greater".

    Returns
    -------
    list of 2d boolean arrays, one mask per cluster, in row-major order
    of each cluster's first (seed) cell.

    Raises
    ------
    ValueError
        If cmp_type is not a supported mode.
    """
    if cmp_type not in ("lower", "greater", "abs_greater"):
        raise ValueError("cmp_type must be in [\"lower\",\"greater\",\"abs_greater\"]")
    if cmp_type == "lower":
        passed = ar < thres
    elif cmp_type == "greater":
        passed = ar > thres
    else:  # "abs_greater"
        passed = abs(ar) > thres
    # builtin bool: np.bool was removed from modern NumPy
    passed = passed.astype(bool)

    rows, cols = passed.shape
    visited = np.zeros(passed.shape, bool)
    clusters = []
    for i_s in range(rows):
        for i_f in range(cols):
            if visited[i_s, i_f] or not passed[i_s, i_f]:
                continue
            # Flood fill from this seed using a plain list as the work
            # stack (no Python-2-only Queue module needed; traversal
            # order does not affect the resulting mask).
            mask = np.zeros(passed.shape, bool)
            stack = [(i_s, i_f)]
            while stack:
                x, y = stack.pop()
                if visited[x, y]:
                    continue
                visited[x, y] = True
                if passed[x, y]:
                    mask[x, y] = True
                    for nx, ny in ((x - 1, y), (x + 1, y), (x, y - 1), (x, y + 1)):
                        if 0 <= nx < rows and 0 <= ny < cols:
                            stack.append((nx, ny))
            clusters.append(mask)
    return clusters
fn = "14318737.txt"
data = []
with open(fn, "r") as f:
    # First header token is "Region"; the rest are the entity names.
    labels = f.readline().rstrip("\n").split()[1:]
    for line in f:
        # Drop the region column; keep the integer counts.
        data.append([int(v) for v in line.split()[1:]])
data = np.array(data)
clusters = find_clusters(data, 0, "greater")
# Keep only clusters with more than five member cells, largest first
# (Python 3: a comprehension replaces the Py2 list-returning filter()).
large_clusters = sorted((cl for cl in clusters if cl.sum() > 5),
                        key=lambda cl: -cl.sum())
# Mask zeros so absent entities render as white rather than light red.
plot_data = np.ma.masked_equal(data, 0)
plt.subplots_adjust(left=0.1, bottom=0.15, right=0.99, top=0.95)
# cmap passed by name: plt.cm.get_cmap is deprecated in recent matplotlib
plt.imshow(plot_data, cmap="Reds", interpolation="nearest", aspect="auto",
           vmin=0, extent=[-0.5, plot_data.shape[1] - 0.5, plot_data.shape[0] - 0.5, -0.5])
plt.colorbar()
for cl in large_clusters:
    # astype(int): np.int was removed from NumPy.
    # linewidths (not lw) is the contour keyword for line width.
    plt.contour(cl.astype(int), [0.5], colors="k", linewidths=2)
# With this extent the columns are centred at 0..len(labels)-1; the
# original arange(0, len+1) had one tick position too many.
plt.xticks(np.arange(len(labels)), labels, rotation=90, va="top", ha="center")
print("Summary of all large clusters:\n")
print("#\tSize\tIn regions")
for i, cl in enumerate(large_clusters):
    # NOTE(review): axis=0 collapses rows, so these indices select entity
    # *columns*, not genomic regions -- confirm the intended axis.
    cols_in_cluster = np.where(np.any(cl, axis=0))[0]
    min_region = labels[cols_in_cluster.min()]
    max_region = labels[cols_in_cluster.max()]
    print("%i\t%i\t%s to %s" % (i, cl.sum(), min_region, max_region))
plt.xlim(-0.5, plot_data.shape[1] - 0.5)
plt.show()
I filter all clusters that have more than five points included. I plot only these. You could alternatively use the sum of data inside each cluster. I then sort these large clusters by their size, descending.
Finally, I print a summary of all large clusters, including the labels of the first and last columns each cluster spans.

If you found this helpful, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With