#!/usr/bin/python

def process(filename, verbose=True):
	"Process ATOM records from PDB file"
	atoms = read_pdb(filename)
	numPhe, numTyr = count(atoms)
	if verbose:
		print "%s: %d PHE and %d TYR" % (filename, numPhe, numTyr)
	return numPhe, numTyr

def count(atoms):
	"Count the number of PHE and TYR residues"
	seen = set()
	numPhe = 0
	numTyr = 0
	for a in atoms:
		rid = (a[3], a[2])	# residue id = (sequence number, chain id)
		if rid in seen:
			continue
		seen.add(rid)
		if a[1] == "PHE":
			numPhe += 1
		elif a[1] == "TYR":
			numTyr += 1
	return numPhe, numTyr

def read_pdb(filename):
	"Read ATOM records from PDB file"
	f = file(filename)
	atoms = []
	for line in f:
		if not line.startswith("ATOM"):
			continue
		# ATOM records are fixed-format, so we cannot use whitespace for parsing
		if line[16] != ' ' and line[16] != 'A':	# Alternate location
			continue
		atom = (line[12:16],			# Atom name
				line[17:20].strip(),	# Residue type
				line[21],		# Chain id
				int(line[22:26]),	# Sequence number
				float(line[30:38]), float(line[38:46]), float(line[46:54]))
		atoms.append(atom)
	f.close()
	return atoms

if __name__ == "__main__":
	def count_all():
		import os
		files = os.listdir(".")
		for filename in files:
			if filename.endswith(".ent"):
				process(filename)
	import cProfile as profile
	from pstats import Stats
	profile.run("count_all()", "optimize_v2.stats")
	s = Stats("optimize_v2.stats")
	s.sort_stats("cumulative", "time", "calls")
	s.print_stats(5)