"""
tfd.py -- Scrapes a certain set of comics' websites, and spits
          out home-grown RSS feeds for them. 

It should be fairly obvious by even Googling the name of the script
(without .py) what comic this was written for, and it also works
for its two sister sites which are run off the same PHP code that
the author presumably wrote himself.

For a while the author posted all of his comics to livejournal accounts
which allowed those of us who consume our daily dose of web through
RSS readers to just use the LJ feed. This practice was sadly discontinued,
and Drew has been quite touchy in the past about publicly accessible
"fan feeds" and had them taken down. 

In order that we respect his wishes it's advisable that you use this 
only for PRIVATE FEEDS, that you and a friend use, or something. If 
one of you has a shared hosting account or a shell account, set it
up to run as a cron job and dump the files somewhere web-accessible,
preferably protected by a .htpasswd style authentication. Drew doesn't
want there to be an 'open-to-everybody' RSS feed for whatever reason
and I'll respect that, but I'll also say that the die-hard nerds 
viewing his site in an alternative format than he intended is just 
something he'll have to put up with.

-----------------------------------------------------------------------

Copyright  (c) 2007-2008 David Warde-Farley.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following
    disclaimer in the documentation and/or other materials provided
    with the distribution.

  * Neither the name of David Warde-Farley nor the names of
    its contributors may be used to endorse or promote products
    derived from this software without specific prior written
    permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import datetime
import email.utils
import re
import sgmllib
import sys
import time
import urllib
import xml.dom.minidom as minidom
from xml.sax.saxutils import quoteattr

# When True, the scraper logs the URL it fetches and each matched comic
# link to stderr.
DEBUG = False


# Mappings from the short acronyms to the main part of the domains.
longnames = {
    'mtts': 'marriedtothesea',
    'tfd': 'toothpastefordinner',
    'nd': 'nataliedee',
}

# Lower-case three-letter month abbreviations, indexed by (month - 1).
# They are used to build the archive page URL, so they are spelled out
# as literals rather than derived from locale-dependent helpers.
months = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]

class ToothpasteScraper(sgmllib.SGMLParser):

    """
    Scrapes a comic's monthly archive page (site layout as of 03/13/08)
    and collects the comics linked from it.

    After construction two instance variables hold the results:

    comic_list  -- list of (link, date, text) tuples, one per matching
                   anchor, in page order. 'date' is "MM/DD/YY" as taken
                   from the link's path; 'text' is the anchor's text.
    comic_title -- text of the page's TITLE tag with newlines removed,
                   or "(No comic title)" if none was seen.

    Works through the hooks provided by SGMLParser: start_a and
    start_title raise flags so that handle_data knows which piece of
    text (if any) it is currently collecting, and end_a / end_title
    lower them again.
    """

    def __init__(self, comic=None):
        """
        Instantiate a ToothpasteScraper and immediately fetch and parse
        the current month's archive page for the given comic.

        comic -- short acronym of the comic; must be a key of the
                 module-level 'longnames' mapping.

        Raises KeyError if 'comic' is not a key of 'longnames' (this
        includes the default of None).
        """
        # Empty comic list.
        self.comic_list = []

        # Fallback title, replaced if the page has a TITLE tag.
        self.comic_title = "(No comic title)"

        # Boolean state flags so that we know when to care about textual data
        self.parsing_comic_link = False
        self.parsing_title = False

        # Raw title text gathered so far; the parser may deliver one run
        # of text in several handle_data calls.
        self._title_text = ""

        # Regular expression that comic links must match. The literal
        # dots of the host name are escaped so they only match a dot.
        self.URL_PATTERN = \
            r"http://www\.%s\.com/[01]\d{5}/([a-zA-Z_]|\-)+\.(gif|jpg)" % \
            (longnames[comic],)

        # Instantiate superclass.
        sgmllib.SGMLParser.__init__(self)

        self._go(comic)

    def _go(self, comic):
        """
        Fetch the archive page for the current month of 'comic' and run
        the parser over it.
        """
        today = datetime.datetime.today()
        month = months[today.month - 1]
        year = str(today.year)[-2:]
        url = 'http://www.%s.com/%s-archives/%sarchive-%s%s.php' % \
            (longnames[comic], comic, comic, month, year)
        if DEBUG:
            print >> sys.stderr, "Fetching " + url
        handle = urllib.urlopen(url)
        try:
            data = handle.read()
        finally:
            # Release the connection even if the read fails part-way.
            handle.close()
        self.feed(data)
        self.close()

    def start_a(self, attributes):
        """
        Called when an opening anchor tag is encountered. If its HREF
        matches URL_PATTERN, remember the link and the date embedded in
        its path, and start collecting the anchor text.
        """
        atts = dict(attributes)
        if 'href' in atts and re.match(self.URL_PATTERN, atts['href']):
            self.parsing_comic_link = True
            if DEBUG:
                print >>sys.stderr, atts['href'].split("/")
            # Path component right after the host is the MMDDYY date.
            raw_d = atts['href'].split("/")[3]
            self.cur_date = "%s/%s/%s" % (raw_d[0:2], raw_d[2:4], raw_d[4:6])
            self.cur_link = atts['href']
            # Start empty so an anchor without any text still yields an
            # entry instead of failing in end_a.
            self.cur_text = ""

    def handle_data(self, data):
        """
        Handles arbitrary data, but only actually does anything if we're
        inside a comic link or a title tag. Text is accumulated because
        a single run of text can arrive in more than one call.
        """
        if self.parsing_comic_link:
            self.cur_text += data
        elif self.parsing_title:
            self._title_text += data
            self.comic_title = "".join(self._title_text.strip().split("\n"))

    def end_a(self):
        """
        Called when an ending anchor tag is encountered. If a comic link
        is being read, it is finalized and appended to comic_list.
        Otherwise nothing happens.
        """
        if self.parsing_comic_link:
            self.comic_list.append(
                (self.cur_link, self.cur_date, self.cur_text.strip()))
            del self.cur_link, self.cur_date, self.cur_text

        # Done outside the if so a stray closing tag also clears the flag.
        self.parsing_comic_link = False

    def start_title(self, attributes):
        """Begin collecting the page title from the TITLE tag."""
        self.parsing_title = True
        self._title_text = ""

    def end_title(self):
        """Stop collecting the page title."""
        self.parsing_title = False

class RSSMaker(object):
    """
    Builds an RSS 2.0 document, as a minidom DOM tree, from a list of
    scraped comics.

    The tree is available as 'xmldocument', its <channel> element as
    'chan_tag', and the normalized site URL as 'url'. Use getfeed() to
    serialize it.
    """

    def __init__(self, url, title, comics, desc=None):
        """
        Create the channel and, if any comics are given, one item each.

        url    -- site URL used as the channel link; a trailing "/" is
                  appended if missing.
        title  -- channel title; a title derived from 'url' is used when
                  this is empty or None.
        comics -- sequence of (link, "MM/DD/YY", text) tuples, or an
                  empty/None value for a feed without items.
        desc   -- channel description; a placeholder is used when this
                  is empty or None.
        """
        # Kept only for callers that read this attribute. Dates are
        # produced with email.utils.formatdate, whose output does not
        # depend on the process locale or on platform handling of %Z.
        self.date_format = "%a, %d %b %Y %H:%M:%S %Z"

        # endswith() also copes with an empty string.
        if not url.endswith("/"):
            url += "/"
        self.url = url

        impl = minidom.getDOMImplementation()
        self.xmldocument = impl.createDocument(None, "rss", None)
        self.xmldocument.documentElement.setAttribute("version", "2.0")
        self.chan_tag = self.xmldocument.createElement("channel")
        self.xmldocument.documentElement.appendChild(self.chan_tag)

        if not title:
            title = "Scraped RSS Feed of " + url
        if not desc:
            desc = "No description provided."

        self._append_text_element(self.chan_tag, "title", title)
        self._append_text_element(self.chan_tag, "link", url)
        self._append_text_element(self.chan_tag, "description", desc)
        # Channel publication date: now, in local time with a numeric
        # UTC offset.
        self._append_text_element(
            self.chan_tag, "pubDate", email.utils.formatdate(localtime=True))

        if comics:
            self._load_comics(comics)

    def _mktxt(self, text):
        """Convenience method for creating text nodes."""
        return self.xmldocument.createTextNode(text)

    def _append_text_element(self, parent, tag, text):
        """Append <tag>text</tag> to 'parent' and return the new element."""
        element = self.xmldocument.createElement(tag)
        element.appendChild(self._mktxt(text))
        parent.appendChild(element)
        return element

    def _load_comics(self, comics):
        """
        Private method called by the constructor that creates one <item>
        per comic and appends it to the channel.

        Each comic is a (link, "MM/DD/YY", text) tuple; the two-digit
        year is taken to be in the 2000s.
        """
        for comic in comics:
            comic_url, comic_date, comic_text = comic[0], comic[1], comic[2]

            month, day, year = [int(x) for x in comic_date.split("/")]
            midnight = datetime.datetime(year + 2000, month, day)
            # mktime() works out whether DST applies on that day, so the
            # numeric offset in the formatted date is the right one.
            datestr = email.utils.formatdate(
                time.mktime(midnight.timetuple()), localtime=True)

            # The description is HTML carried as text inside the feed,
            # so the scraped values must be quoted as HTML attributes.
            desc = "<img src=%s alt=%s />" % (
                quoteattr(comic_url), quoteattr(comic_text))

            comic_item = self.xmldocument.createElement("item")
            self._append_text_element(comic_item, "title", comic_text)
            self._append_text_element(comic_item, "link", comic_url)
            self._append_text_element(comic_item, "description", desc)
            self._append_text_element(comic_item, "pubDate", datestr)
            self._append_text_element(comic_item, "guid", comic_url)

            self.chan_tag.appendChild(comic_item)

    def getfeed(self):
        """
        Return the XML document as a string, serialized with minidom's
        toprettyxml().
        """
        return self.xmldocument.toprettyxml()

def usage(argv):
    """
    Print a one-line command-line synopsis to stderr.

    argv -- the program's argument list; only argv[0] (the program
            name) is used.
    """
    program_name = argv[0]
    synopsis = "usage: %s -c {mtts,tfd,nd}" % program_name
    print >> sys.stderr, synopsis


if __name__ == "__main__":
    import getopt
    opts, args = getopt.getopt(sys.argv[1:],'c:')
    try:
        d = dict(opts)
        comic = d['-c']
    except:
        usage(sys.argv)
        sys.exit(1)
    
    scraper = ToothpasteScraper(comic=comic)
    f = RSSMaker("http://www.%s.com/" % (longnames[comic]), \
        title=scraper.comic_title, comics=scraper.comic_list)

    print f.getfeed()
