from urllib import urlopen
import threading
import re
import Queue
def get_info(url):
    '''extracts product info from a product page'''
    p = re.compile(r'<tr><td .*?>(.*?)</td><td>(.*?)</td></tr>')
    text = urlopen(url).read()
    info = []
    for t, v in p.findall(text):
        info.append((t, v))
    return info
def get_urls(page_url):
    '''gets product urls from a page'''
    p = re.compile(r"<div class='p-name'><a target='_blank' href='(.*?)'>", re.S)
    text = urlopen(page_url).read()
    urls = []
    for url in p.findall(text):
        urls.append(url)
    return urls
def get_page_urls():
    '''creates urls of the pages'''
    page_urls = []
    for i in range(1, 23):
        page_urls.append('http://www.360buy.com/products/670-671-672-0-0-0-0-0-0-0-1-1-%d.html' % i)
    return page_urls
def product_worker(product_url_queue):
    '''thread function of the product worker, downloads product info'''
    global g_mutex, f, g_c, g_done
    while not g_done or product_url_queue.qsize() > 0:
        url = product_url_queue.get()   # blocks until a url is available
        try:
            info = get_info(url)
        except Exception, e:
            product_url_queue.put(url)  # put the url back to retry it later
            print e
            continue
        g_mutex.acquire()
        print '==>', g_c
        g_c += 1
        for t, v in info:
            f.write(t + ':::' + v + '\n')
        f.write('\n#####\n')
        f.flush()
        g_mutex.release()
def page_urls_worker(product_url_queue, page_url):
    '''thread function of the page urls worker, queues the product urls found on a page'''
    for product_url in get_urls(page_url):
        product_url_queue.put(product_url)
        print '.'
f = open('data.txt', 'w')    # output file
g_c = 0                      # counter, for showing progress
g_done = False               # end flag: set once all product urls are queued
g_mutex = threading.Lock()   # mutex protecting the counter and the output file
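
The listing stops at the shared state, so for completeness, here is a minimal sketch of how the pieces could be wired together. The worker count, the daemon flag, and the shutdown sequence are assumptions, not part of the code above:

import time

product_url_queue = Queue.Queue()

# producers: one thread per listing page, each filling the queue with product urls
page_threads = []
for page_url in get_page_urls():
    t = threading.Thread(target=page_urls_worker,
                         args=(product_url_queue, page_url))
    t.start()
    page_threads.append(t)

# consumers: a handful of workers draining the queue (4 is an arbitrary choice)
for i in range(4):
    t = threading.Thread(target=product_worker, args=(product_url_queue,))
    t.setDaemon(True)  # daemonized: a worker blocked on an empty queue
    t.start()          # will not keep the process alive at exit

for t in page_threads:
    t.join()           # every product url is queued once the producers finish

while product_url_queue.qsize() > 0:
    time.sleep(0.5)    # wait for the workers to drain the queue
g_done = True          # signal the workers to stop

time.sleep(1)          # let any in-flight write finish before closing
f.close()

The workers are daemonized because product_worker blocks in get() once the queue is empty; giving get() a timeout would allow a clean join instead.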