前两天学校里拍了学位照,现在在档案馆网站上可以看到自己的照片了。由于登录只需要学号和姓名信息,所以就写了个脚本把照片全部抓了下来。脚本主要用到了 urllib2 和 urllib(汗,以前一直用很底层的 httplib 做东西)。另外,这次写代码也让我对浏览器如何完成整个请求过程有了比较深入的了解。脚本执行时需要同目录下有个数据文件(software.txt),每行包含一位同学的姓名和学号,用逗号隔开(姓名在前,学号在后)。代码如下:
# Batch-download degree photos from the NJU archive web site.
#
# For every "name,student_no" record in a data file the script logs in to the
# site with that name and number, looks for the photo link in the returned
# page, downloads the image and stores it as "<student_no>_<name>.jpg".
#
# The code runs unchanged on Python 2.6+ and on Python 3.
import contextlib
import locale
import os
import re
import urllib

try:  # Python 2
    import urllib2
    urlencode = urllib.urlencode
except ImportError:  # Python 3: the same API lives in urllib.request / urllib.parse
    import urllib.parse
    import urllib.request as urllib2
    urlencode = urllib.parse.urlencode

LOGIN_URL = 'http://dawww.nju.edu.cn/xwz/login.asp'
PICTURE_URL = 'http://dawww.nju.edu.cn/xwz/picture.asp'
# Captures the query string ("?i=<digits>") of the photo link in the login page.
PICTURE_PATTERN = r'src="picture.asp(\?i=\d+)"'
# Plain (non-raw) literal so the path contains single backslashes; the file
# name is appended with os.path.join, so no trailing separator is needed.
DEFAULT_OUTPUT_DIR = 'D:\\temp'
DEFAULT_DATA_FILE = 'software.txt'


def save_image(dir, image_name, image):
    """Write ``image`` to the file ``image_name`` inside directory ``dir``.

    The directory is created when it does not exist yet.  Any I/O failure
    (directory creation, open, or write) is reported on stdout instead of
    being raised.

    Args:
        dir: Target directory, with or without a trailing separator.  The
            parameter name is kept for interface compatibility.
        image_name: File name to create inside ``dir``.
        image: Raw image bytes.

    Returns:
        True when the file was written, False when an I/O error occurred.
    """
    target = os.path.join(dir, image_name)
    try:
        # An empty ``dir`` means "current directory": nothing to create.
        if dir and not os.path.isdir(dir):
            os.makedirs(dir)
        with open(target, 'wb') as image_file:
            image_file.write(image)
    except (IOError, OSError) as err:
        print("I/O error({0}):{1}".format(err.errno, err.strerror))
        return False
    return True


def get_image(name, student_no, login_url, picture_url, pattern):
    """Log in as one student and download that student's photo.

    Args:
        name: Student name sent as form field ``xm``.  Bytes are sent
            percent-encoded as-is; Python 3 text is encoded by ``urlencode``
            (UTF-8 by default).
        student_no: Student number sent as form field ``xh``.
        login_url: URL the login form is POSTed to.
        picture_url: Base URL of the picture page; the text captured by
            ``pattern`` is appended to it.
        pattern: Text regular expression whose group 1 is the suffix to
            append to ``picture_url``.

    Returns:
        The raw image bytes, or None when the login page contains no match
        for ``pattern`` (the student has not had a picture taken).

    Raises:
        Whatever the opener raises on network failure (e.g. ``URLError``).
    """
    # A fresh cookie-aware opener per call keeps each student's login session
    # separate from the previous one.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    # Kept from the original behaviour: later plain urlopen() calls made by
    # other code share this opener.
    urllib2.install_opener(opener)

    params = urlencode({'xm': name, 'xh': student_no, 'login': 'yes'})
    # Percent-encoded form data is pure ASCII; Python 3 requires the POST
    # body to be bytes.
    post_data = params.encode('ascii')

    # closing() guarantees the response is released even if read() fails.
    with contextlib.closing(opener.open(login_url, post_data)) as response:
        login_html = response.read()

    # Python 3 returns bytes.  latin-1 maps every byte to one character, so
    # decoding cannot fail and ASCII patterns match exactly as on the raw page.
    if not isinstance(login_html, str):
        login_html = login_html.decode('latin-1')

    match = re.search(pattern, login_html)
    if match is None:
        return None

    with contextlib.closing(opener.open(picture_url + match.group(1))) as response:
        return response.read()


def _parse_record(raw_line):
    """Split one b"name,student_no" line; return None if blank or malformed."""
    fields = raw_line.strip().split(b',')
    if len(fields) < 2:
        return None
    name = fields[0].strip()
    student_no = fields[1].strip()
    if not name or not student_no:
        return None
    return name, student_no


def _native(raw, encoding):
    """Return ``raw`` as the interpreter's native ``str`` type.

    On Python 2 bytes already are ``str`` and are returned untouched; on
    Python 3 they are decoded, replacing undecodable bytes so that one odd
    record cannot abort the whole run.
    """
    if isinstance(raw, str):
        return raw
    return raw.decode(encoding, 'replace')


def grab_all(data_file=DEFAULT_DATA_FILE, output_dir=DEFAULT_OUTPUT_DIR,
             encoding=None):
    """Download the photo of every student listed in ``data_file``.

    Each line of the data file holds ``name,student_no``.  Blank lines and
    lines without both fields are skipped.  Names are sent to the server with
    exactly the bytes found in the file.

    Args:
        data_file: Path of the record file (default ``software.txt``).
        output_dir: Directory the images are written to.
        encoding: Encoding used to turn the file's bytes into text for the
            output file names on Python 3; defaults to the locale's preferred
            encoding.

    Returns:
        None.  Progress and errors are reported on stdout.
    """
    if encoding is None:
        encoding = locale.getpreferredencoding(False)

    try:
        # Binary mode: the bytes go to the server unmodified on both Python
        # versions, and line endings are stripped explicitly below.
        student_file = open(data_file, 'rb')
    except IOError as err:
        print("I/O error({0}):{1}".format(err.errno, err.strerror))
        return

    with student_file:
        for raw_line in student_file:
            record = _parse_record(raw_line)
            if record is None:
                continue
            name, student_no = record
            file_name = '{0}_{1}.jpg'.format(_native(student_no, encoding),
                                             _native(name, encoding))
            try:
                image = get_image(name, student_no, LOGIN_URL, PICTURE_URL,
                                  PICTURE_PATTERN)
            except IOError as err:
                # A network failure for one student must not abort the batch.
                print('{0} download failed: {1}'.format(file_name, err))
                continue
            # Only announce success when the file really was written.
            if image is not None and save_image(output_dir, file_name, image):
                print(file_name + ' is saved')


if __name__ == "__main__":
    grab_all()
