"""
Script to export comments from Haloscan to an .xml file so they
can be imported into some other blog system.
DISCLAIMER: This is provided "AS-IS", I make no guarantees that this
works, it's based on screen scraping so could stop working whenever
Haloscan change their pages.
According to Haloscan's Terms of Service (http://www.haloscan.com/privacy/)
it's not explicitly forbidden to screen scrape their site. HOWEVER, they say:
"We reserve the right to suspend, delete, or cancel any account/service at
any time for any reason."
This script makes a new http request for every single comment so if you
have thousands of comments and Haloscan doesn't approve of you pounding
their server and suspends your account and you lose all your comments,
I CANNOT BE HELD RESPONSIBLE. USE AT YOUR OWN RISK! You've been warned!
"""
# Module metadata; __version__ and __url__ are printed in the start-up banner.
__version__ = '1.0'
__author__ = 'Einar Egilsson'
__date__ = 'August 13th 2007'
__url__ = 'http://tech.einaregilsson.com/2007/08/13/export-haloscan-comments/'
import urllib2, re, os, sys, string
# Session cookie shared by all requests. open_url() stores the 'set-cookie'
# header of a response here while this is still unset, and sends the stored
# value back as the 'cookie' request header.
LOGIN_COOKIE = None
def open_url(url, postdata=None, headers=None):
    """Fetch a page from www.haloscan.com and return the response body.

    url      -- path (plus query string) appended to 'http://www.haloscan.com'.
    postdata -- optional request body, passed to urllib2.Request as its data
                argument.
    headers  -- optional dict of extra request headers. It is copied, so the
                caller's dict is never modified.

    While the module-level LOGIN_COOKIE is unset, the 'set-cookie' header of
    the response (if any) is stored in it; once set, it is sent as the
    'cookie' header of every request. Exceptions raised by urllib2 propagate
    to the caller.
    """
    global LOGIN_COOKIE
    # Work on a copy: a shared default dict or the caller's dict must not
    # accumulate the cookie header between calls.
    request_headers = dict(headers or {})
    # Only send a cookie once we actually have one; otherwise the first
    # request would carry a cookie header whose value is None.
    if LOGIN_COOKIE:
        request_headers['cookie'] = LOGIN_COOKIE
    request = urllib2.Request('http://www.haloscan.com' + url, postdata,
                              request_headers)
    response = urllib2.build_opener().open(request)
    try:
        if not LOGIN_COOKIE:
            # .get() leaves the cookie unset (to be retried on the next
            # request) instead of raising KeyError when none was sent.
            LOGIN_COOKIE = response.headers.dict.get('set-cookie')
        return response.read()
    finally:
        response.close()
def login(username, password):
    """Post the Haloscan login form so the session cookie gets stored.

    username -- Haloscan account name.
    password -- Haloscan account password.

    The credentials are sent to '/members/' through open_url(). The response
    body is not inspected, so 'Login complete' is printed whether or not the
    credentials were accepted.
    """
    # Local import: urllib is only needed here, for percent-encoding.
    import urllib

    print('Logging in...')
    fields = {
        'entered_login': username,
        'entered_password': password,
        'enter': 'Sign In',
    }
    # Percent-encode every value so characters such as '&', '=', '%', '+' or
    # spaces in the credentials cannot corrupt the form body. With safe=''
    # the submit value is encoded as 'Sign%20In', exactly as sent before.
    postdata = '&'.join(
        '%s=%s' % (key, urllib.quote(value, ''))
        for key, value in fields.items()
    )
    open_url('/members/', postdata)
    print('Login complete')
def _first_group(pattern, html, comment_id, flags=0):
    """Return group 1 of the first match of pattern in html.

    Raises ValueError naming the pattern and comment_id when nothing matches,
    instead of failing with an AttributeError on None.
    """
    match = re.search(pattern, html, flags)
    if match is None:
        raise ValueError(
            'Could not find %r on the edit page of comment %s' %
            (pattern, comment_id))
    return match.group(1)


def export_comments(username, password, outfile, delay=100):
    """Log in to Haloscan and write every comment to outfile as XML.

    username -- Haloscan account name.
    password -- Haloscan account password.
    outfile  -- path of the XML file to write. If it already exists, the
                number of '<comment>' entries in it is used as the starting
                offset and new comments are added after them.
    delay    -- accepted for backward compatibility; this function does not
                use it.

    The comment list is read page by page from '/members/posts.php' and each
    comment's edit page is fetched separately, i.e. one HTTP request per
    comment. The loop ends when a list page yields no comments.

    Raises ValueError when an expected field cannot be found on a comment's
    edit page. Errors from open_url() and file I/O propagate; in either case
    the output file is closed and keeps the comments written so far, so a
    later run can resume.
    """
    login(username, password)
    closing_tag = '</comments>'
    counter = 0
    if os.path.exists(outfile):
        existing_file = open(outfile, 'r')
        try:
            existing = existing_file.read()
        finally:
            existing_file.close()
        counter = existing.count('<comment>')
        print('File already exists, found %s comments' % counter)
        print('Starting at comment %s' % (counter+1))
        trimmed = existing.rstrip()
        if trimmed.endswith(closing_tag):
            # A previous run completed and closed the document. Drop its
            # closing tag (and the newline written before it) so new comments
            # land inside <comments> and the tag is not written twice.
            trimmed = trimmed[:-len(closing_tag)]
            if trimmed.endswith('\n'):
                trimmed = trimmed[:-1]
            out = open(outfile, 'w')
            out.write(trimmed)
        else:
            out = open(outfile, 'a')
    else:
        out = open(outfile, 'w')
        out.write('<?xml version="1.0" encoding="ISO-8859-1"?>\n')
        out.write("<comments>\n")
    xml_template = """
<comment>
<name>%(name)s</name>
<email>%(email)s</email>
<url>%(url)s</url>
<time>%(year)s-%(month)s-%(day)s %(hour)s:%(minute)s:%(sec)s</time>
<ip>%(ip)s</ip>
<thread>%(thread_id)s</thread>
<commentId>%(comment_id)s</commentId>
<text><![CDATA[%(text)s]]></text>
</comment>"""
    try:
        while True:
            listing = open_url('/members/posts.php?start=%s' % counter)
            comments = re.findall(
                r'(\d+)</a></td><td><a href="editpost.php\?post\=(\d+)"',
                listing)
            if not comments:
                break
            for thread_id, comment_id in comments:
                page = open_url('/members/editpost.php?post=%s' % comment_id)
                values = {'thread_id': thread_id, 'comment_id': comment_id}
                for field in ('name', 'email', 'url'):
                    values[field] = _first_group(
                        r'<input name="edit%s".*?value="(.*?)"' % field,
                        page, comment_id)
                for field in ('day', 'year', 'hour', 'minute', 'sec'):
                    values[field] = _first_group(
                        r' name="t%s" value="?(\d+)"' % field,
                        page, comment_id).zfill(2)
                values['ip'] = _first_group(
                    r'<b>IP</b>:\s*(.*?)<br />', page, comment_id)
                text = _first_group(
                    r'<textarea.*?name="editmessage".*?>(.*?)</textarea>',
                    page, comment_id, re.DOTALL)
                # ']]>' inside the text would end the CDATA section early;
                # split it across two adjacent CDATA sections instead.
                values['text'] = text.replace(']]>', ']]]]><![CDATA[>')
                # The first <option> marked selected is taken as the month.
                values['month'] = _first_group(
                    r'<option value="(\d+)" selected="selected">',
                    page, comment_id).zfill(2)
                counter += 1
                values['number'] = counter
                out.write(xml_template % values)
                print('Comment %(number)s: %(name)s at %(month)s %(day)s, %(year)s - %(hour)s:%(minute)s:%(sec)s' % values)
                # Flush per comment so an interrupted run loses nothing
                # already fetched and can be resumed.
                out.flush()
        out.write('\n</comments>')
    finally:
        out.close()
if __name__ == '__main__':
    # Script entry point: show the disclaimer and banner, ask for the
    # Haloscan credentials and export to '<username>.xml'.
    import getpass

    print(__doc__[__doc__.index('DISCLAIMER'):])
    print('Haloscan Comment Exporter v%s' % __version__)
    print('%s\n' % __url__)
    user = raw_input('Username: ')
    # getpass reads the password without echoing it on the terminal.
    password = getpass.getpass('Password: ')
    export_comments(user, password, user + '.xml')