Python Web Crawler
Published: 2019-06-09


Python code to crawl articles from 36dsj.com (36 Big Data)

# -*- coding:UTF-8 -*-
# Python 2 crawler for 36dsj.com (36 Big Data) articles.
import os
import sys
import urllib
import urllib2
import httplib
from pyquery import PyQuery as pq

# Force HTTP/1.0 to avoid chunked-transfer problems with some servers
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

reload(sys)
sys.setdefaultencoding("utf-8")

# Fetch a page and decode it as UTF-8
def getHtml(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    return unicode(response.read(), 'utf-8')

# Save text to filepath, creating the directory and replacing any old file
def saveHtml(filepath, html):
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if os.path.exists(filepath):
        os.remove(filepath)
    f = open(filepath, 'w')
    f.write(html)
    f.close()

# Read a saved file back as unicode
def readHtml(filepath):
    f = open(filepath, 'rb')
    return unicode(f.read(), 'utf-8')

# Parse a saved page with pyquery
def resolveHtml(filepath):
    return pq(filename=filepath)

# Extract [id, title, href, author, date, category, summary] from one excerpt
def resolveBlog(content):
    d_cont = pq(content)
    title = d_cont('h2').text()                     # article title
    href = d_cont('h2').find('a').attr('href')      # article link
    blog_id = href.split('/')[-1]                   # article ID
    au_tm_cat = d_cont('p').filter('.info').text()  # "author date category"
    author = au_tm_cat.split()[0]
    date = au_tm_cat.split()[1]
    cat = au_tm_cat.split()[2]
    note = d_cont('p').filter('.note').text()       # summary
    return [blog_id, title, href, author, date, cat, note]

# Parse one listing page into a list of article records
def crawlpage(url):
    page = urllib2.urlopen(url)
    text = unicode(page.read(), "utf-8")
    d = pq(text)
    bloglist = []
    excerpts = d('article').filter('.excerpt')
    for i in range(excerpts.length):
        bloglist.append(resolveBlog(excerpts.eq(i).html()))
    return bloglist

# Download each article on a listing page: body, title, and first image
def crawler(url):
    for art in crawlpage(url):
        print art[0], art[1], art[2]
        html = getHtml(art[2])
        htmlname = art[2].split('/')[-1]
        d = pq(html)
        saveHtml('/etl/etldata/script/tmpdir/html/' + htmlname + '.html', d('article').html())
        saveHtml('/etl/etldata/script/tmpdir/html/' + htmlname + '.title', art[1])
        imgurl = d('img').attr('src')
        urllib.urlretrieve(imgurl.encode("utf-8"),
                           '/etl/etldata/script/tmpdir/image/' + htmlname + '.jpg')

# Crawl listing pages 1-4; skip any page that fails
url_base = 'http://www.36dsj.com/page/'
for i in range(1, 5):
    url = url_base + str(i)
    print url
    try:
        crawler(url)
    except Exception:
        continue
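The code above is Python 2 only (urllib2, httplib tweaks, and sys.setdefaultencoding are all gone in Python 3). As a point of comparison, here is a minimal Python 3 sketch of the same listing-page crawl, assuming the site still serves the same article/.excerpt markup; requests replaces urllib2, and the function names and output directory are illustrative, not from the original:

import os
import requests
from pyquery import PyQuery as pq

def crawl_listing(url):
    """Return [id, title, href] for each excerpt on a listing page."""
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    d = pq(resp.text)
    articles = []
    for item in d('article.excerpt').items():
        href = item('h2 a').attr('href')
        if href:
            articles.append([href.split('/')[-1], item('h2').text(), href])
    return articles

def save_article(href, outdir='html'):
    """Download one article body and save it under its URL slug."""
    os.makedirs(outdir, exist_ok=True)
    resp = requests.get(href, timeout=10)
    resp.encoding = 'utf-8'
    body = pq(resp.text)('article').html() or ''
    name = href.split('/')[-1]
    with open(os.path.join(outdir, name + '.html'), 'w', encoding='utf-8') as f:
        f.write(body)

if __name__ == '__main__':
    for art in crawl_listing('http://www.36dsj.com/page/1'):
        print(art[0], art[1], art[2])
        save_article(art[2])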

Calling the Python crawler from a shell script

#!/bin/bash
# Run the crawler, then pick one downloaded article at random and mail it.
alias dt='date +%Y-%m-%d" "%H:%M:%S'
shopt -s expand_aliases
pypath=/usr/local/python2.7/bin
dir=/etl/etldata/script

# Count the images the crawler has downloaded
a=`ls ${dir}/tmpdir/image/ | wc -l`
echo "ALL:${a}"

# Pick a random index between 1 and 19
num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d\n",sum)}'`
if [ ${num} -eq 0 ]; then
    num=1
fi
echo ${num}

# Run the crawler, then list the article IDs (image names without .jpg)
$pypath/python ${dir}/tmpdir/crawler_36.py
ls -ltr ${dir}/tmpdir/image/ | awk '{print $NF}' | sed 's/\.jpg//g' > ${dir}/tmp

# Take the num-th ID and pass it to the mail script
id=`cat ${dir}/tmp | head -${num} | awk '{print $1}' | tail -1`
echo "`dt`:${id}"
$pypath/python ${dir}/tmpdir/mail.py ${id}
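The script's only nontrivial step is picking a random article ID out of the image directory. Since mail.py is not shown in the original, here is just that selection step as a Python 3 sketch; pick_random_article_id is a hypothetical helper name, and the directory path is the one the crawler above writes to:

import os
import random

# Same directory the crawler saves images into (see crawler() above)
IMAGE_DIR = '/etl/etldata/script/tmpdir/image'

def pick_random_article_id(image_dir=IMAGE_DIR):
    """Return one downloaded article ID (an image filename minus '.jpg')."""
    ids = [name[:-4] for name in os.listdir(image_dir) if name.endswith('.jpg')]
    if not ids:
        raise RuntimeError('no downloaded articles in ' + image_dir)
    return random.choice(ids)

if __name__ == '__main__':
    # The original script hands this ID to mail.py, which is not shown here
    print(pick_random_article_id())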

Scraping arbitrary web pages with Python

# -*- coding:UTF-8 -*-
# Python 2 utilities: page fetching with encoding detection, plus a
# PK10 lottery-draw scraper for baidu.lecai.com.
import os
import sys
import urllib
import urllib2
import chardet
from pyquery import PyQuery as pq

reload(sys)
sys.setdefaultencoding("utf-8")

# Fetch a page and decode it as UTF-8
def getHtml(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    return unicode(response.read(), 'utf-8')

# Fetch a page, sniff its encoding with chardet, and re-encode it
# to the local filesystem encoding
def get(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    s1 = sys.getfilesystemencoding()
    s2 = chardet.detect(html)['encoding']
    print s1, s2
    return html.decode(s2).encode(s1)

# Append text to filepath, creating the directory if needed
def saveHtml(filepath, html):
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    f = open(filepath, 'a')
    f.write(html)
    f.close()

# Read a saved file back as unicode
def readHtml(filepath):
    f = open(filepath, 'rb')
    return unicode(f.read(), 'utf-8')

# Parse a saved page with pyquery
def resolveHtml(filepath):
    return pq(filename=filepath)

# Extract [id, title, href, author, date, category, summary] from one excerpt
# (same helper as in the crawler above)
def resolveBlog(content):
    d_cont = pq(content)
    title = d_cont('h2').text()
    href = d_cont('h2').find('a').attr('href')
    blog_id = href.split('/')[-1]
    au_tm_cat = d_cont('p').filter('.info').text()
    author = au_tm_cat.split()[0]
    date = au_tm_cat.split()[1]
    cat = au_tm_cat.split()[2]
    note = d_cont('p').filter('.note').text()
    return [blog_id, title, href, author, date, cat, note]

# Parse one 36dsj listing page into a list of article records
def crawlpage(url):
    page = urllib2.urlopen(url)
    text = unicode(page.read(), "utf-8")
    d = pq(text)
    bloglist = []
    excerpts = d('article').filter('.excerpt')
    for i in range(excerpts.length):
        bloglist.append(resolveBlog(excerpts.eq(i).html()))
    return bloglist

# Fetch a page as raw bytes
def GetHtml(url):
    page = urllib.urlopen(url)
    return page.read()

# Pull one draw row (date, draw number, winning numbers) out of a table row
def GetData(cont):
    data = pq(cont)
    date_id = data('td').eq(0).text()  # draw date
    code_id = data('td').eq(1).text()  # draw number
    A = data('td').eq(2).text()        # winning numbers
    return date_id + ' ' + code_id + ' ' + A + '\r\n'

# Other pages the original experimented with:
#   http://www.weather.com.cn/weather1d/101240303.shtml#input
#   http://www.nmc.gov.cn/publish/forecast/AJX/wuyuan.html
#   http://php.weather.sina.com.cn/search.php?city=%E6%C4%D4%B4&dpc=1
#   http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html
#   http://hq.sinajs.cn/list=sh601006

# Scrape one day of PK10 draws (date_id like '2017-06-10') into a CSV
def craw_pk10(date_id):
    url = 'http://baidu.lecai.com/lottery/draw/list/557?d=' + date_id
    html = getHtml(url)
    d = pq(html)
    rows = d('table').filter('#draw_list').find('tbody').find('tr')
    print rows.length
    for i in range(rows.length):
        saveHtml('/home/hadoop/python/PK10_20170610.csv', GetData(rows.eq(i)))
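The get() helper above shows the standard pattern for pages whose encoding is unknown ahead of time: read raw bytes, sniff with chardet, then decode. A Python 3 sketch of the same idea, assuming requests and chardet are installed; fetch_any_encoding is an illustrative name, not from the original:

import requests
import chardet

def fetch_any_encoding(url):
    """Fetch a page, detect its encoding with chardet, and return decoded text."""
    raw = requests.get(url, timeout=10).content
    enc = chardet.detect(raw)['encoding'] or 'utf-8'  # fall back if detection fails
    return raw.decode(enc, errors='replace')

if __name__ == '__main__':
    print(fetch_any_encoding('http://www.36dsj.com/')[:200])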

 

Reposted from: https://www.cnblogs.com/Jims2016/p/7134753.html
