# -*- coding: cp949 -*-
"""Export a Melon chart (daily, weekly or monthly) to a CSV file.

Usage: python melon_top_100.py [daily|weekly|monthly]

The chart page is downloaded, parsed with BeautifulSoup and written to
'<chart date>.csv' in the current directory, with every cell encoded as
cp949.
"""
import csv
import sys
import urllib2

from bs4 import BeautifulSoup

# Chart period (as given on the command line) -> chart page URL.
CHART_URLS = {
    'daily': 'http://www.melon.com/static/cds/chart/web/chartdaily_list.html',
    # This entry used to be a copy of the daily URL, so "weekly" silently
    # exported the daily chart.  The weekly page name is assumed to follow
    # the daily/monthly naming pattern -- TODO confirm against the site.
    'weekly': 'http://www.melon.com/static/cds/chart/web/chartweekly_list.html',
    'monthly': 'http://www.melon.com/static/cds/chart/web/chartmonthly_list.html',
}

USAGE = u'Usage: python melon_top_100.py [daily|weekly|monthly]'


def parse_chart(html):
    """Extract the chart date and the ranked entries from a chart page.

    Args:
        html: Markup of a chart page.

    Returns:
        A ``(date, chart)`` pair.  ``date`` is the text of the date heading
        and ``chart`` is a tuple of ``(rank, delta, song, artist, album)``
        tuples: the rows of the first '.boardListWhite' table followed by
        the rows of the second one.

    Raises:
        AttributeError: If the date heading or a table body is missing.
        IndexError: If fewer than two '.boardListWhite' tables are found.
    """
    # NOTE(review): no parser is named, so bs4 chooses one from whatever is
    # installed; pin it once the parser the page was tested with is known.
    soup = BeautifulSoup(html)
    date = soup.find('div', class_='dateChart clearfix').p.span.text

    tables = soup.select('.boardListWhite')
    top50 = tables[0].tbody.find_all('tr', recursive=False)
    top100 = tables[1].tbody.find_all('tr', recursive=False)

    return date, tuple(parse_tr(top50) + parse_tr(top100))


def parse_tr(trs):
    """Convert chart table rows into ``(rank, delta, song, artist, album)``.

    Args:
        trs: Iterable of bs4 ``<tr>`` tags taken from a chart table body.

    Returns:
        A list with one tuple per row whose rank image has a numeric
        ``alt`` text; other rows are skipped.
    """
    items = []
    for tr in trs:
        tds = tr.find_all('td', recursive=False)

        # td 0 is the checkbox column; nothing to read there.
        # td 1 holds the rank as the image's alt text.  Rows whose alt text
        # is not a number are not chart entries.
        alt = tds[1].img['alt']
        if not alt.isnumeric():
            continue
        rank = int(alt)

        # td 2 holds the rank change; the span's first CSS class tells the
        # direction, which is turned into a sign prefix.
        updn = tds[2].span['class'][0]
        delta = tds[2].span.text
        if updn == u'up2':
            delta = u'+' + delta
        elif updn == u'down2':
            delta = u'-' + delta

        # td 3 holds song, artist and album links.
        links = tds[3].find_all('a', title=True)
        morelist = tds[3].find('span', class_='overList')
        song = links[0]['title']
        if morelist:
            # Several artists: they are listed inside the 'overList' span,
            # and the album is then the last titled link of the cell.
            artist = ', '.join(a.text for a in morelist.find_all('a'))
            album = links[-1]['title']
        else:
            artist = links[1]['title']
            album = links[2]['title']

        items.append((rank, delta, song, artist, album))
    return items


def get_html(url, maxbuf=10485760):
    """Download *url* and return at most *maxbuf* bytes of the response.

    Args:
        url: Address to fetch.
        maxbuf: Upper bound on the number of bytes read (default 10 MiB).

    Returns:
        The raw response body as a byte string.
    """
    res = urllib2.urlopen(url)
    try:
        return res.read(maxbuf)
    finally:
        # Close the response even when read() fails.
        res.close()


def _safe_file_name(title):
    """Return *title* with path separators replaced by underscores.

    The title is scraped from a web page, so it must not be able to steer
    the output file into another directory.
    """
    return title.replace(u'/', u'_').replace(u'\\', u'_')


def main(argv):
    """Run the exporter.

    Args:
        argv: Command-line arguments including the program name; exactly
            one chart period ('daily', 'weekly' or 'monthly') is expected.

    Returns:
        0 on success, 2 when the arguments are missing or not recognised.
    """
    if len(argv) != 2 or argv[1] not in CHART_URLS:
        # An unknown period used to escape as a KeyError traceback.
        print(USAGE)
        return 2

    html = get_html(CHART_URLS[argv[1]])
    title, chart = parse_chart(html)

    file_name = _safe_file_name(title) + u'.csv'
    # Python 2 csv module: open in binary mode and hand it encoded bytes.
    with open(file_name, 'wb') as f:
        writer = csv.writer(f, csv.excel)
        for item in chart:
            writer.writerow([unicode(i).encode('cp949') for i in item])

    print(u'Chart exported to \'%s\'' % file_name)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))