# eutils_sample.py

#
# This is an example of parsing XML input for PC204 Challenge 2.
# We provide working code for querying the EUtils ESearch service
# for publications of an author, parsing the XML output, and
# returning a list of PubMed IDs as strings.
#
# For Challenge 2, you will also need to query the EUtils
# ESummary service for details corresponding to PubMed IDs
# (in particular, the author list).  You will need to
# construct the query data and parse the XML output, just
# like 'get_pubs_for_author' below.  Constructing the
# query should be straightforward but parsing the output
# requires a bit of work.
#
# We suggest you implement a function 'get_pubs_for_ids'
# which takes a list of PubMed IDs (as strings) and returns
# a map whose keys are PubMed IDs and whose values are
# lists of author names.  It will send a single ESummary
# query and extract the author lists for all publications
# from the XML results.
#
# - ESummary has a 10,000 ID limit for PubMed queries, so you
#   may want to handle that in your function.
# - When processing many PubMed IDs, do NOT send one query
#   per publication.  That is slow on the query side and
#   places an unnecessary load on the EUtils servers.
#
# To parse ESummary results, you will need to implement
# a class similar to 'IDHandler' below, but it will need to
# deal with a few more XML tags.  The full ESummary XML format
# is somewhat complicated, but we can treat the output as
# essentially something like:
#
#   ... stuff we do not care about ...
#   <DocSum>
#     ... stuff we do not care about ...
#     <Id>PubMed ID</Id>
#     ... stuff we do not care about ...
#     <Item Name="Author">Author name</Item>
#     ... more Item elements for authors ...
#     ... stuff we do not care about ...
#   </DocSum>
#   ... more DocSum elements with same structure ...
#
# You may want to start with the 'IDHandler' code and change
# it to handle the extra tags.  For checking the 'Name' attribute
# in an '<Item>' element, you probably want something like:
#
#	def startElement(self, name, attrs):
#		...
#		if name == "Item" and attrs["Name"] == "Author":
#		...
#
# Hint: You will probably want to use the '<DocSum>' element
# opening and closing tags as markers for resetting and saving
# the data (PubMed ID and author list) for the current publication.
#
# For those who prefer full documentation, start with
#   https://docs.python.org/2/library/xml.sax.html
# which describes how the Python xml.sax module works.
# 

# ---------------------------------------------------------------------------

#
# ESearch URL for Entrez EUtils (https://www.ncbi.nlm.nih.gov/books/NBK25497/)
#
# The HTTPS endpoint is used on purpose: 'get_pubs_for_author' sends the
# query as POST data, and urllib does not resend POST data when a request
# is redirected (e.g. from http:// to https://).
#
SearchURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

def get_pubs_for_author(author):
	"""Return list of PubMed ids (as strings) for an author.

	'author' should be of form 'Surname Initials'
	where 'Initials' is a single string consisting
	of the first letter of all given names.

	A single ESearch request is sent to 'SearchURL' as POST
	data and the XML reply is parsed with 'IDHandler'.  The
	connection is closed before returning, whether or not
	parsing succeeds; errors from the network or from the
	XML parser are propagated to the caller."""

	# Import the functions we will use
	from contextlib import closing
	from xml.sax import parse
	try:
		# Python 2
		from urllib import urlopen, urlencode
	except ImportError:
		# Python 3
		from urllib.request import urlopen
		from urllib.parse import urlencode
	data = (
		("db", "pubmed"),
		("usehistory", "n"),
		("retmax", "100000"),
		("term", "%s[Author]" % author),
	)

	# Create our custom XML handler for extracting PubMed IDs
	id_extractor = IDHandler()

	# Connect to EUtils service.  'f' is a file-like
	# object from which we will read the XML results.
	# 'closing' guarantees that 'f.close()' is called when
	# we leave the 'with' block, so we do not depend on the
	# XML parser to release the connection for us.
	with closing(urlopen(SearchURL, urlencode(data).encode("ascii"))) as f:
		# Read the XML stream; the handler accumulates the IDs
		parse(f, id_extractor)
	return id_extractor.id_list

#
# The way XML parsing works is that we call
# 'xml.sax.parse' with the input XML stream
# and a 'handler' object.  'parse' reads the
# characters from the input stream and calls
# handler methods at opportune times with the
# relevant data.  For example, when an opening
# tag such as '<Id>' is read, 'parse' calls the
# handler's 'startElement' method with the name of
# the tag ('Id') and a map of attribute values (empty
# in this example).
#
# To read our specific XML data, e.g., ESearch
# results, we pass in a customized handler that
# squirrels away the text between '<Id>' and
# '</Id>' tags.  To do this, we create a subclass
# of the standard 'xml.sax.handler.ContentHandler'
# class and override some methods that are called
# by 'xml.sax.parse'.
#
# In fact, we create two subclasses: TextGrabber
# and IDHandler.  TextGrabber provides methods
# for grabbing text during XML processing and
# is intended to be further subclassed for
# specific XML formats.  IDHandler uses TextGrabber
# to save text of '<Id>' elements in a list.
#

from xml.sax.handler import ContentHandler
class TextGrabber(ContentHandler):
	"""Class for grabbing text from XML files.

	Subclasses call 'startGrab' when they reach an element whose
	text they want and 'endGrab' when that element ends; text
	delivered by the parser in between is accumulated in
	'grab_content'."""

	# Class-level defaults so that 'characters' and 'endGrab'
	# are safe to call even if 'initGrab' or 'startGrab' has
	# not been called yet on this instance.
	grabbing = False
	grab_content = ""

	def initGrab(self):
		"""Initialize text-grabbing state.

		Normally, this function is called from 'startDocument'.
		Any text left over from a previous grab is discarded."""

		self.grabbing = False
		self.grab_content = ""

	def startGrab(self):
		"""Start saving text from XML files.

		Any previously saved text is discarded."""

		self.grabbing = True
		self.grab_content = ""

	def endGrab(self):
		"""Stop saving text.

		Returns text saved since the last call to 'startGrab'
		(the empty string if no grab was ever started)."""

		self.grabbing = False
		return self.grab_content

	# Override 'ContentHandler' method.
	def characters(self, content):
		# Save characters during a grab.  The parser may deliver
		# the text of one element in several chunks, so we append
		# rather than assign.
		if self.grabbing:
			self.grab_content += content

class IDHandler(TextGrabber):
	"""SAX handler that collects the text of every '<Id>' element.

	Once parsing has finished, 'id_list' holds the collected
	strings (PubMed IDs, for ESearch results) in the order in
	which their closing tags were seen."""

	# Override 'ContentHandler' method.
	def startDocument(self):
		# Start every document with an empty result list ...
		self.id_list = []
		# ... with no '<Id>' element currently open ...
		self.id_grab = False
		# ... and with the 'TextGrabber' machinery reset.
		self.initGrab()

	# Override 'ContentHandler' method.
	def startElement(self, name, attrs):
		# Only '<Id>' elements are of interest; everything
		# else in the ESearch output is skipped.
		if name != "Id":
			return
		# Begin collecting the text of this element and
		# remember that an ID is in progress.
		self.startGrab()
		self.id_grab = True

	# Override 'ContentHandler' method.
	def endElement(self, name):
		# Ignore closing tags other than '</Id>', and ignore
		# a '</Id>' for which no grab is in progress (that
		# should not happen, but checking costs nothing).
		if name != "Id" or not self.id_grab:
			return
		# Finish the grab and record the collected text.
		pubmed_id = self.endGrab()
		self.id_grab = False
		self.id_list.append(pubmed_id)

if __name__ == "__main__":
	# Smoke test when run as a script: look up one author
	# and pretty-print the PubMed IDs that come back.
	from pprint import pprint
	pprint(get_pubs_for_author("EL-SAMAD H"))