#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
'''
bashfr_download.py 1.0.0

This program downloads the whole bashfr.org archive into a single HTML file.
http://www.bashfr.org/?sort=browse&p=1

Why: I wanted to read bashfr.org offline. 

License: This program is public domain.

Author: Sébastien SAUVAGE (webmaster of http://sebsauvage.net)
'''

# Use this if you use a proxy (or define the HTTP_PROXY variable in your environment)
#import os; os.environ['HTTP_PROXY'] = 'http://proxy.myisp.com:3128'

import sys,urllib2,re

# Download every "browse" page of the archive, starting at page 1, and
# collect the quotes found on each page into the `quotes` dictionary.
currentPage = 1
print('bashfr_download 1.0.0')
sys.stdout.write('Downloading page...')

# Matches one quote block; group 1 is the quote number, group 2 is the
# HTML of the quote itself. Raw string so the backslash escapes reach the
# regex engine untouched.
re_quote = re.compile(r'<div class="quote-1">#<a href="\?(\d+)">.+?</a><br />(.+?)</div>',re.IGNORECASE|re.DOTALL)

quotes = {}   # key=quote number (int), value=the quote itself (HTML code, unicode)

while True:
    response = urllib2.urlopen('http://www.bashfr.org/?sort=browse&p=%d' % currentPage)
    try:
        # NOTE(review): at most 200000 bytes are read per page; a larger page
        # would be truncated silently -- confirm this cap is still adequate.
        html = response.read(200000)
    finally:
        # Release the connection even if the read fails.
        response.close()
    sys.stdout.write('.')   # One progress dot per downloaded page.

    # Extract the quotes BEFORE looking for the "next page" link, so that the
    # quotes of the final page (which has no such link) are kept as well.
    for (number,quote) in re_quote.findall(html):
        # The page bytes are decoded as ISO-8859-1 into unicode.
        quotes[int(number)] = quote.decode('ISO-8859-1')

    # No ">>>" link on the page: this was the last page, stop here.
    if '&gt;&gt;&gt;</a>' not in html:
        break

    currentPage += 1

# Build the output document: one "<hr><b>number</b><br>quote" entry per
# quote, in ascending quote-number order, closed by a final <hr>.
# A single join avoids re-copying the growing string on every iteration.
body = u''.join([u'<hr><b>%d</b><br>%s' % (number,quotes[number]) for number in sorted(quotes)])
body += u'<hr>'

# Page template; the charset declared here (utf-8) is the encoding the
# document is expected to be saved with. The <title> element is properly
# closed with </title>.
html = u'''<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>http://www.bashfr.org/</title>
<style type="text/css">
<!--
body { font-family: monospace; font-size:10pt;} 
-->
</style>
</head>
<body><b>http://www.bashfr.org/</b>%s</body>
</html>''' % body

# Save the generated page as UTF-8 bytes (binary mode, so no newline
# translation takes place). The handle is closed even if encoding or
# writing fails, and it no longer shadows the `file` builtin.
output_file = open('bashfr_quotes.html','wb')
try:
    output_file.write(html.encode('utf-8'))
finally:
    output_file.close()
