#
# This is an example of parsing XML input for PC204 Challenge 2.
# We provide working code for querying the EUtils ESearch service
# for publications of an author, parsing the XML output, and
# returning a list of PubMed IDs as strings.
#
# For Challenge 2, you will also need to query the EUtils
# ESummary service for details corresponding to PubMed IDs
# (in particular, the author list). You will need to
# construct the query data and parse the XML output, just
# like 'get_pubs_for_author' below. Constructing the
# query should be straightforward but parsing the output
# requires a bit of work.
#
# We suggest you implement a function 'get_pubs_for_ids'
# which takes a list of PubMed IDs (as strings) and returns
# a map whose keys are PubMed IDs and whose values are
# lists of author names. It will send a single ESummary
# query and extract the author lists for all publications
# from the XML results.
#
# - ESummary has a 10,000 ID limit for PubMed queries, so you
# may want to handle that in your function.
# - When processing many PubMed IDs, do NOT send one query
# per publication. That is slow on the query side and
# places an unnecessary load on the EUtils servers.
#
# To parse ESummary results, you will need to implement
# a class similar to 'IDHandler' below, but it will need to
# deal with a few more XML tags. The full ESummary XML format
# is somewhat complicated, but we can treat the output as
# essentially something like:
#
# ... stuff we do not care about ...
# <DocSum>
# ... stuff we do not care about ...
# <Id>PubMed ID</Id>
# ... stuff we do not care about ...
# <Item Name="Author">Author name</Item>
# ... more Item elements for authors ...
# ... stuff we do not care about ...
# </DocSum>
# ... more DocSum elements with same structure ...
#
# You may want to start with the 'IDHandler' code and change
# it to handle the extra tags. For checking the 'Name' attribute
# in an '<Item>' element, you probably want something like:
#
# def startElement(self, name, attrs):
# ...
# if name == "Item" and attrs["Name"] == "Author":
# ...
#
# Hint: You will probably want to use the '<DocSum>' element
# opening and closing tags as markers for resetting and saving
# the data (PubMed ID and author list) for the current publication.
#
# For those who prefer full documentation, start with
# https://docs.python.org/2/library/xml.sax.html
# which describes how the Python xml.sax module works.
#
# ---------------------------------------------------------------------------
#
# ESearch URL for Entrez EUtils (http://www.ncbi.nlm.nih.gov/books/NBK25497/)
#
# Use the HTTPS endpoint: the E-utilities base URL documented by NCBI
# is https://eutils.ncbi.nlm.nih.gov/entrez/eutils/.
SearchURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
def get_pubs_for_author(author):
    """Return list of PubMed ids for an author.

    'author' should be of form 'Surname Initials'
    where 'Initials' is a single string consisting
    of the first letter of all given names.

    A single ESearch request is sent to 'SearchURL' and the
    text of every '<Id>' element in the XML reply is returned
    as a list of strings.  Errors raised while opening the URL
    or while parsing the XML are not caught here."""
    # Import the URL functions we will use
    try:
        # Python 2
        from urllib import urlopen, urlencode
    except ImportError:
        # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    # 'closing' gives us a 'with' block that works for the
    # objects returned by both the Python 2 and Python 3 'urlopen'.
    from contextlib import closing
    from xml.sax import parse

    data = (
        ("db", "pubmed"),
        ("usehistory", "n"),
        ("retmax", "100000"),
        ("term", "%s[Author]" % author),
    )
    # Create our custom XML handler for extracting PubMed IDs
    id_extractor = IDHandler()
    # Connect to EUtils service, sending the encoded query as the
    # request data.  'f' is a file-like object from which we read
    # the XML results; the 'with' block guarantees it is closed
    # even if parsing raises an exception.
    with closing(urlopen(SearchURL, urlencode(data).encode("ascii"))) as f:
        # Read the XML stream; the handler accumulates the PubMed IDs
        parse(f, id_extractor)
    return id_extractor.id_list
#
# The way XML parsing works is that we call
# 'xml.sax.parse' with the input XML stream
# and a 'handler' object. 'parse' reads the
# characters from the input stream and calls
# handler methods at opportune times with the
# relevant data. For example, when an opening
# tag such as '<Id>' is read, 'parse' calls the
# handler 'startElement' method with the name of
# the tag ('Id') and a map of attribute values (empty
# in this example).
#
# To read our specific XML data, e.g., ESearch
# results, we pass in a customized handler that
# squirrels away the text between '<Id>' and
# '</Id>' tags. To do this, we create a subclass
# of the standard 'xml.sax.handler.ContentHandler'
# class and override some methods that are called
# by 'xml.sax.parse'.
#
# In fact, we create two subclasses: TextGrabber
# and IDHandler. TextGrabber provides methods
# for grabbing text during XML processing and
# is intended to be further subclassed for
# specific XML formats. IDHandler uses TextGrabber
# to save text of '<Id>' elements in a list.
#
from xml.sax.handler import ContentHandler
class TextGrabber(ContentHandler):
    """Class for grabbing text from XML files.

    Intended to be subclassed: a subclass calls 'startGrab' when it
    reaches an element whose text it wants and 'endGrab' when that
    element ends.  While a grab is active, every chunk of text passed
    to 'characters' is appended to 'grab_content'."""

    # Class-level defaults so that 'characters' and 'endGrab' are safe
    # to call even before 'initGrab' or 'startGrab' has been called.
    grabbing = False
    grab_content = ""

    def initGrab(self):
        """Initialize text-grabbing state.

        Normally, this function is called from 'startDocument'."""
        self.grabbing = False
        # Also reset the buffer so that 'endGrab' returns an empty
        # string (instead of failing) if no grab was ever started.
        self.grab_content = ""

    def startGrab(self):
        """Start saving text from XML files.

        Any previously saved text is discarded."""
        self.grabbing = True
        self.grab_content = ""

    def endGrab(self):
        """Stop saving text.

        Returns text saved since the last call to 'startGrab'
        (an empty string if no grab was started)."""
        self.grabbing = False
        return self.grab_content

    # Override 'ContentHandler' method.
    def characters(self, content):
        """Append 'content' to the saved text if a grab is active."""
        # The text of one element may arrive in several calls,
        # so append rather than assign.
        if self.grabbing:
            self.grab_content += content
class IDHandler(TextGrabber):
    """SAX handler that collects the text of every '<Id>' element.

    After parsing ESearch XML results, 'id_list' holds the PubMed
    IDs (as strings) in the order their '<Id>' elements were closed."""

    # Override 'ContentHandler' method.
    def startDocument(self):
        """Reset all state at the beginning of a document."""
        # No PubMed IDs have been collected yet.
        self.id_list = []
        # We are not inside an '<Id>' element yet.
        self.id_grab = False
        # Put the inherited 'TextGrabber' state into its idle mode.
        self.initGrab()

    # Override 'ContentHandler' method.
    def startElement(self, name, attrs):
        """Begin capturing text when an '<Id>' tag opens."""
        # Every tag other than 'Id' is of no interest to us.
        if name != "Id":
            return
        self.startGrab()
        self.id_grab = True

    # Override 'ContentHandler' method.
    def endElement(self, name):
        """Store the captured text when an '</Id>' tag closes."""
        # Only act on '</Id>', and only if we really were
        # capturing an ID (we _should_ be, but check anyway).
        if name != "Id" or not self.id_grab:
            return
        pubmed_id = self.endGrab()
        self.id_grab = False
        self.id_list.append(pubmed_id)
if __name__ == "__main__":
    # Smoke test: look up one author and pretty-print
    # the list of PubMed IDs that comes back.
    import pprint
    pprint.pprint(get_pubs_for_author("EL-SAMAD H"))