Python code for crawling 36大数据 (36dsj.com) articles
# -*- coding:UTF-8 -*-
import urllib2
import re
import os
import time
from sgmllib import SGMLParser
from pyquery import PyQuery as pq
from lxml import etree
import urllib
import sys
import httplib

# Force HTTP/1.0 so the server does not use chunked transfer encoding
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

reload(sys)
sys.setdefaultencoding("utf-8")

# Fetch a page
def getHtml(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = unicode(response.read(), 'utf-8')
    return html

# Save page content, creating the target directory if needed
def saveHtml(filepath, html):
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if os.path.exists(filepath):
        os.remove(filepath)
    f = open(filepath, 'w')
    f.write(html)
    f.close()

# Read a saved page
def readHtml(filepath):
    f = open(filepath, 'rb')
    return unicode(f.read(), 'utf-8')

# Parse a saved page
def resolveHtml(filepath):
    d = pq(filename=filepath)
    return d

# Extract the fields of one article excerpt
def resolveBlog(content):
    d_cont = pq(content)
    # article title
    title = d_cont('h2').text()
    # article href
    href = d_cont('h2').find('a').attr('href')
    # article ID
    id = href.split('/')[-1]
    # author / date / category
    au_tm_cat = d_cont('p').filter('.info').text()
    author = au_tm_cat.split()[0]
    date = au_tm_cat.split()[1]
    cat = au_tm_cat.split()[2]
    # summary
    note = d_cont('p').filter('.note').text()
    blog = [id, title, href, author, date, cat, note]
    return blog

# URLs tried during testing; the last assignment is the one used
#url = 'http://www.91333.com/pk10/'
#url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
url = 'http://www.36dsj.com/'
#html = getHtml(url)

# Crawl one listing page and return its article list
def crawlpage(url):
    page = urllib2.urlopen(url)
    text = unicode(page.read(), "utf-8")
    d = pq(text)
    bloglist = []
    for i in range(d('article').filter('.excerpt').length):
        content = d('article').filter('.excerpt').eq(i).html()
        a = resolveBlog(content)
        #print a[5]
        bloglist.append(a)
    return bloglist

# For each article on a listing page, save its HTML, title and first image
def crawler(url):
    article = crawlpage(url)
    for i in article:
        print i[0], i[1], i[2]
        html = getHtml(i[2])
        htmlname = i[2].split('/')[-1]
        d = pq(html)
        s = d('article').html()
        saveHtml('/etl/etldata/script/tmpdir/html/' + htmlname + '.html', s)
        saveHtml('/etl/etldata/script/tmpdir/html/' + htmlname + '.title', i[1])
        imgurl = d('img').attr('src')
        ir = imgurl.encode("utf-8")
        urllib.urlretrieve(ir, '/etl/etldata/script/tmpdir/image/' + htmlname + '.jpg')
        #saveHtml('/etl/etldata/input/html/'+i[0]+'.html',html)

# Crawl listing pages 1-4
url_base = 'http://www.36dsj.com/page/'
for i in range(1, 5):
    url = url_base + str(i)
    print url
    try:
        crawler(url)
    except:
        continue

#url = 'http://www.36dsj.com/page/1'
#crawler(url)
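The script above is Python 2 only (urllib2 and sgmllib no longer exist in Python 3). For comparison, here is a minimal Python 3 sketch of the same fetch-and-parse step using requests with the same pyquery selectors; the function name crawl_listing is mine, not from the original.

# a minimal Python 3 sketch, not from the original post: fetch one listing
# page with requests and extract (title, href) pairs with pyquery
import requests
from pyquery import PyQuery as pq

def crawl_listing(url):
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    d = pq(resp.text)
    articles = d('article').filter('.excerpt')
    blogs = []
    for i in range(articles.length):
        item = articles.eq(i)
        # same selectors as resolveBlog above
        blogs.append((item('h2').text(), item('h2').find('a').attr('href')))
    return blogs

if __name__ == '__main__':
    for title, href in crawl_listing('http://www.36dsj.com/'):
        print(title, href)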
Shell script that calls the Python crawler for 36大数据 articles
#!/bin/bash
alias dt='date +%Y-%m-%d" "%H:%M:%S'
shopt -s expand_aliases

pypath=/usr/local/python2.7/bin
dir=/etl/etldata/script

a=`echo $RANDOM`
a=`ls ${dir}/tmpdir/image/ | wc -l`
echo "ALL:${a}"

# pick a random line number between 1 and 19
num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
if [ ${num} -eq 0 ]; then
    num=1
fi
#num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
echo ${num}
#num=1
#dir=/etl/etldata/script

# refresh the crawled articles and images
$pypath/python ${dir}/tmpdir/crawler_36.py

# list the saved image names (one article id per line)
ls -ltr ${dir}/tmpdir/image/ | awk '{print $NF}' | sed 's/.jpg//g' > ${dir}/tmp
#id=`cat ${dir}/tmp | sed -n "${num}p" | awk '{print $1}'`
id=`cat ${dir}/tmp | head -${num} | awk '{print $1}' | tail -1`
echo "`dt`:${id}"

# mail the randomly chosen article
$pypath/python ${dir}/tmpdir/mail.py ${id}
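The core of the script is: count the saved images, pick a random article id, and pass it to mail.py. The same selection step reads more directly in Python; a sketch, assuming the same image directory layout (pick_random_id is a hypothetical helper, not part of the original scripts):

# hypothetical helper, assuming .jpg files named <article-id>.jpg
# as produced by crawler_36.py above
import os
import random

def pick_random_id(image_dir='/etl/etldata/script/tmpdir/image/'):
    ids = [f[:-len('.jpg')] for f in os.listdir(image_dir) if f.endswith('.jpg')]
    return random.choice(ids) if ids else None

print(pick_random_id())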
Crawling web pages with Python
# -*- coding:UTF-8 -*-
import urllib2
import re
import os
import time
from sgmllib import SGMLParser
from pyquery import PyQuery as pq
from lxml import etree
import urllib
import sys
import requests
import json
import chardet

reload(sys)
sys.setdefaultencoding("utf-8")

# Crawl one listing page of 36dsj.com and return its article list
def crawlpage(url):
    page = urllib2.urlopen("http://www.36dsj.com/")
    text = unicode(page.read(), "utf-8")
    d = pq(text)
    bloglist = []
    for i in range(d('article').filter('.excerpt').length):
        content = d('article').filter('.excerpt').eq(i).html()
        a = resolveBlog(content)
        #print a[5]
        bloglist.append(a)
    return bloglist

# Fetch a page by url (assumes UTF-8 content)
def getHtml(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = unicode(response.read(), 'utf-8')
    return html

# Fetch a page, detect its encoding with chardet, and transcode it
# to the local filesystem encoding
def get(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    s1 = sys.getfilesystemencoding()
    s2 = chardet.detect(html)['encoding']
    print s1, s2
    #print html
    return html.decode(s2).encode(s1)

# Save page content, creating the target directory if needed
def saveHtml(filepath, html):
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if os.path.exists(filepath):
        os.remove(filepath)
    f = open(filepath, 'a')
    f.write(html)
    f.close()

# Read a saved page
def readHtml(filepath):
    f = open(filepath, 'rb')
    return unicode(f.read(), 'utf-8')

# Parse a saved page
def resolveHtml(filepath):
    d = pq(filename=filepath)
    return d

# Extract the fields of one article excerpt
def resolveBlog(content):
    d_cont = pq(content)
    # article title
    title = d_cont('h2').text()
    # article href
    href = d_cont('h2').find('a').attr('href')
    # article ID
    id = href.split('/')[-1]
    # author / date / category
    au_tm_cat = d_cont('p').filter('.info').text()
    author = au_tm_cat.split()[0]
    date = au_tm_cat.split()[1]
    cat = au_tm_cat.split()[2]
    # summary
    note = d_cont('p').filter('.note').text()
    blog = [id, title, href, author, date, cat, note]
    return blog

def GetHtml(url):
    page = urllib.urlopen(url)
    contex = page.read()
    return contex

# Extract one lottery draw row: date, draw number, winning numbers
def GetData(cont):
    data = pq(cont)
    date_id = data('td').eq(0).text()
    code_id = data('td').eq(1).text()
    A = data('td').eq(2).text()
    s = date_id + ' ' + code_id + ' ' + A + '\r\n'
    return s

# URLs tried during testing; the last uncommented assignment is the one used
#url = 'http://www.36dsj.com/'
#url = 'http://www.91333.com/pk10/'
#url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
#url = 'http://www.weather.com.cn/weather1d/101240303.shtml#input'
#url = 'http://www.nmc.gov.cn/publish/forecast/AJX/wuyuan.html'
url = 'http://php.weather.sina.com.cn/search.php?city=%E6%C4%D4%B4&dpc=1'
#url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'
#url = 'http://hq.sinajs.cn/list=sh601006'

# Crawl the PK10 lottery draws for one day and append them to a CSV file
def craw_pk10(date_id):
    url = 'http://baidu.lecai.com/lottery/draw/list/557?d=' + date_id
    html = getHtml(url)
    d = pq(html)
    data = d('table').filter('#draw_list').find('tbody').find('tr')
    print data.length
    for i in range(data.length):
        s = GetData(data.eq(i))
        a = open('/home/hadoop/python/PK10_20170610.csv', 'a')
        a.write(s)
        a.close()
    #saveHtml('/home/hadoop/python/2017-06-10.csv',s)
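Note that craw_pk10 is defined but never invoked above. A sketch of a driver that backfills a range of days, assuming the d= query parameter takes dates as YYYY-MM-DD (which the commented-out 2017-06-10 filename suggests); the start date and range are illustrative:

# hypothetical driver, not from the original post: call craw_pk10 once per
# day over a short range; assumes the d= parameter format is YYYY-MM-DD
import datetime

start = datetime.date(2017, 6, 1)
for offset in range(10):
    date_id = (start + datetime.timedelta(days=offset)).strftime('%Y-%m-%d')
    craw_pk10(date_id)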