Biology: Genome Assembly
Cost of genome assembly has drastically decreased over the past decade as a result of advances in technology. One of the key innovations is an algorithmic one. Sequencing large segments of genome is prohibitively expensive but if we could take shorter overlapping sequences or reads of dna and reconstruct them, then we can automate more of the sequencing process.
.
Here we present one such algorithm where reads are deconstructed into a graph where a read (for example ACGCTCA) is represented as a edge between nodes ACGCTC --> CGCTCA. Then we traverse the graph using DFS to produce a possible reconstruction of the genome (note: there may be more than one plausible reconstruction)
#Read in data def read_adjacency_graph(input_file): data = [] with open(input_file, 'r') as infile: to_data = '' for ln in infile: if ln[0] == '>': data.append(to_data) to_data = '' else: to_data += ln.strip() data.append(to_data) return data def split_strand_into_kmers(strand, k): res = [] for i in xrange(len(strand) - k + 1): res.append(strand[i: i+k]) return res def format_adjacency_graph_from_kmers(read_file): data = {} for kmer in read_file: kmer = kmer.strip() if kmer[1:] not in data.get(kmer[:-1], []): data[kmer[:-1]] = data.get(kmer[:-1], []) + [kmer[1:]] return data def findStart(adjacency_graph): #Traverse through keys twice ... this can be expensive ... to find start reverse_index = {} #Construct reverse index (counts number of indegrees) for key in adjacency_graph.keys(): for item in adjacency_graph[key]: reverse_index[item] = reverse_index.get(item, 0) + 1 for key in adjacency_graph.keys(): if len(adjacency_graph.get(key, [])) > reverse_index.get(key, 0): return key return -1 def findEulerianPath(adjacency_graph, start): stack = [start] res = [] while(stack): if stack[-1] in adjacency_graph.keys() and adjacency_graph[stack[-1]] != []: stack.append(adjacency_graph[stack[-1]].pop()) else: res = [stack.pop()] + res return res def concatKmers(path): res = path.pop(0) for kmer in path: res += kmer[-1] return res def toOutputFile(to_output, output_file): #clear output file f2 = open(output_file, 'w+') f2.seek(0) f2.truncate() f2.write(to_output) #Weaves all functions into sequence def main(): read_file = read_adjacency_graph('sample_data.txt') data = [split_strand_into_kmers(strand, 6) for strand in read_file] data = reduce(lambda a, b: a + b, data) adj = format_adjacency_graph_from_kmers(data) start = findStart(adj) path = findEulerianPath(adj, start) path = concatKmers(path) toOutputFile(path, 'output.txt') main()