Remember the earlier image-crawler tutorial? The idea here is the same. I wanted to download all of this year's CVPR papers; I probably (certainly) won't read them all, but it's nice to flip through them when I have spare time. So I wrote a small script, and if you'd like to download them too, this should make it fairly convenient. When I get the chance, I'll also share the papers from this year that I understood well and found worthwhile.
import urllib
import re
import os
import urlparse

def auto_down(url, filename):
    # Retry if the download was cut short mid-transfer.
    try:
        image = urllib.urlretrieve(url, filename)
    except urllib.ContentTooShortError:
        image = auto_down(url, filename)
    return image

def down_load_cvpr():
    urls = "http://openaccess.thecvf.com/CVPR2017.py"
    try:
        page = urllib.urlopen(urls)
        html = page.read()
        # Grab every PDF link on the open-access index page.
        reg = r'href="(.+?\.pdf)"'
        imgre = re.compile(reg)
        pdf_list = re.findall(imgre, html)
        for pdf in pdf_list:
            target_path = "./CVPR_2017_paper"
            pdf_url = "http://openaccess.thecvf.com/" + str(pdf)
            url_parse = urlparse.urlparse(pdf_url)
            file_name = url_parse.path.split("/")[-1]
            if not os.path.exists(target_path):
                os.makedirs(target_path)
            target = os.path.join(target_path, '{}'.format(file_name))
            print "[*] Downloading paper: {}".format(pdf_url)
            auto_down(pdf_url, target)
    except Exception as e:
        # Report failures instead of silently swallowing them.
        print "[!] Error: {}".format(e)

if __name__ == "__main__":
    down_load_cvpr()
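
The script above is Python 2 (print statement, the urllib and urlparse modules). If you are on Python 3, a minimal equivalent sketch might look like the following; it assumes the index page is still served at the same URL and still links the PDFs with relative hrefs:

import os
import re
import urllib.error
import urllib.request
from urllib.parse import urlparse

BASE = "http://openaccess.thecvf.com/"

def auto_down(url, filename):
    # Retry if the download was cut short mid-transfer.
    try:
        return urllib.request.urlretrieve(url, filename)
    except urllib.error.ContentTooShortError:
        return auto_down(url, filename)

def down_load_cvpr():
    html = urllib.request.urlopen(BASE + "CVPR2017.py").read().decode("utf-8", "ignore")
    target_path = "./CVPR_2017_paper"
    if not os.path.exists(target_path):
        os.makedirs(target_path)
    # Same regex as above: grab every PDF link on the index page.
    for pdf in re.findall(r'href="(.+?\.pdf)"', html):
        pdf_url = BASE + pdf
        file_name = urlparse(pdf_url).path.split("/")[-1]
        print("[*] Downloading paper: {}".format(pdf_url))
        auto_down(pdf_url, os.path.join(target_path, file_name))

if __name__ == "__main__":
    down_load_cvpr()

urlretrieve is kept here only to mirror the original; for heavier use, a streaming download with proper error handling (for example via the requests library) would be the more robust choice.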
That's all for now. Good night!