Author: GoGo闯

Python-Flask Search-Result-Caching Spider Pool Test



python-flask script:

# coding:utf-8

'''
Search-result mirror site cluster    Author: GoGo闯    Updated: 2019-04-25
'''

from flask import Flask, request, redirect, render_template, send_from_directory
import datetime, json, sys, random, re, hashlib, requests
from cachetools.func import ttl_cache  # imported for result caching (see the sketch after the script)

import MySQLdb as mdb
# one module-level connection, shared by all request handlers
con = mdb.connect(host='localhost', port=3306, user='root', passwd='password',
                  db='ceshi', charset='utf8', unix_socket='/tmp/mysql.sock')

# Python 2 only: force UTF-8 as the default string encoding
reload(sys)
sys.setdefaultencoding('utf8')

app = Flask(__name__)

pagedata = {
    "domain.com":{
        "index_title":"蜘蛛测试",
        "index_desc":"这是蜘蛛池测试的站点",
        #"out_links":out_links(),
        "muban":1,
        },  
}

today = datetime.date.today()

def md5(src):
    m2 = hashlib.md5()
    m2.update(src)
    return  m2.hexdigest()

'''Insert a new keyword into the words (keyword) table'''
def input_mysql_words(word):
    try:
        cur = con.cursor()
        # parameterized: scraped keywords may contain quotes that would break a formatted string
        cur.execute('insert into words values (%s,%s,%s)', (word, md5(word), today))
        con.commit()
    except:
        con.rollback()

'''Execute a data-writing SQL statement (optionally parameterized)'''
def input_data(sql, params=None):
    try:
        cur = con.cursor()
        cur.execute(sql, params)
        con.commit()
    except:
        con.rollback()

'''Fetch a random batch of keywords to show on the home page'''
def findall_mysql_words():
    words = []
    cur = con.cursor()
    cur.execute('select * from words order by rand() limit 300')
    number = int(cur.rowcount)

    for i in range(number):
        row = cur.fetchone()
        word = {
            'word':row[0],'md5':row[1],'add_date':row[2],
        }
        words.append(word)
    return words

'''Look up a keyword's info by its md5'''
def md5_words(md5):
    cur = con.cursor()
    cur.execute('select * from words where md5=%s', (md5,))
    row = cur.fetchone()

    return {'word': row[0], 'md5': row[1], 'add_date': row[2]}

def get_ua():
    '''Classify the visitor: spider=1 if the UA looks like a search engine, sogou=1 if it is the Sogou web spider'''
    ua = request.headers.get('User-Agent')
    if search("(Spider|spider|bot|Bot)", ua,1) != 'no':
        spider = 1
        if 'Sogou web spider' in ua:
            sogou = 1
        else:
            sogou = 0
    else:
        spider = 0
        sogou = 0

    return {"spider":spider,"sogou":sogou}

def search(req, html, n):
    '''Return group n of the first match of regex req in html, or the string 'no' when nothing matches'''
    text = re.search(req,html)
    if text:
        data = text.group(n)
    else:
        data = 'no'
    return data

'''Check whether the text table already has cached rows for this keyword'''
def judge_lists_word(md5):
    cur = con.cursor()
    cur.execute('select count(1) from text where md5=%s', (md5,))
    now = cur.fetchone()
    return now[0]

'''Read the cached result rows for a keyword'''
def read_word_wxtext(md5):
    wx_dict = []
    cur = con.cursor()
    cur.execute('select * from text where md5=%s', (md5,))
    number = int(cur.rowcount)
    for i in range(number):
        row = cur.fetchone()
        title = row[1]
        desc = row[2]
        auto = row[3]
        imgurl = row[4]
        textlink = row[5]
        
        wx_dict.append({
            'title':title,
            'desc':desc,
            'auto':auto,
            'imgurl':imgurl,
            'textlink':textlink,
        })
    return wx_dict

def read_imgurl(id):
    cur = con.cursor()
    cur.execute('select imgurl from imgurl where id=%s', (id,))
    data = cur.fetchone()[0]
    return data


def read_xgword(md5):
    xgword_list = []
    cur = con.cursor()
    cur.execute('select * from xg_word where md5=%s', (md5,))
    data = cur.fetchone()
    for i in data[2].split("@"):
        xgword = i.split('|')[0]
        xgmd5 = i.split('|')[1]
        xgword_list.append({
            'xgword':xgword,'xgmd5':xgmd5,
        })
    return xgword_list

def getHTml(url):

    headers = {
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding":"gzip, deflate, sdch",
        "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
        "Cache-Control":"no-cache",
        "Connection":"keep-alive",
        #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; lsv=globalTjs_97273d6-wwwTcss_8eba1c3-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_eabc62a-framejs_902a6d8-globalBjs_2d41ef9-sugjs_97bfd68-wwwjs_8d1160b; MSA_WH=1433_772; BAIDUID=E9B0B6A35D4ABC6ED4891FCC0FD085BD:FG=1; plus_cv=1::m:2a9fb36a; H_WISE_SIDS=107504_106305_100040_100100_109550_104341_107937_108437_109700_109794_107961_108453_109737_109558_109506_110022_107895_107917_109683_109588_110072_107318_107300_107242_100457; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; rsv_jmp_slow=1474644236473; sug=3; sugstore=1; ORIGIN=0; bdime=21110; H_PS_645EC=60efFRJ1dM8ial205oBcDuRmtLgH3Q6NaRzxDuIkbMkGVXNSHmXBfW0GZL4l5pnj; BD_UPN=123253; BD_CK_SAM=1; BDSVRTM=110; H_PS_PSSID=17947",
        #"Host":"weixin.sogou.com",
        "Upgrade-Insecure-Requests":"1",
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    }

    # Abuyun rotating HTTP proxy; fill in your own credentials
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    proxyUser = ""
    proxyPass = ""

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
      "host" : proxyHost,
      "port" : proxyPort,
      "user" : proxyUser,
      "pass" : proxyPass,
    }

    proxies = {
        "http"  : proxyMeta,
        "https" : proxyMeta,
    }

    html = requests.get(url, headers=headers, timeout=30, proxies=proxies)
    return html.content


def get_shebei():
    '''Classify the visitor as 'wap' (mobile) or 'pc' based on the User-Agent'''
    ua = request.headers.get('User-Agent')
    # note: these patterns escape with doubled backslashes, so they must be plain (non-raw) strings
    reg_b = re.compile("(android|bb\\d+|meego).+mobile|avantgo|bada\\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\\.(browser|link)|vodafone|wap|windows ce|xda|xiino", re.I|re.M)

    reg_v = re.compile("1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\\-(n|u)|c55\\/|capi|ccwa|cdm\\-|cell|chtm|cldc|cmd\\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\\-s|devi|dica|dmob|do(c|p)o|ds(12|\\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\\-|_)|g1 u|g560|gene|gf\\-5|g\\-mo|go(\\.w|od)|gr(ad|un)|haie|hcit|hd\\-(m|p|t)|hei\\-|hi(pt|ta)|hp( i|ip)|hs\\-c|ht(c(\\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\\-(20|go|ma)|i230|iac( |\\-|\\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\\/)|klon|kpt |kwc\\-|kyo(c|k)|le(no|xi)|lg( g|\\/(k|l|u)|50|54|\\-[a-w])|libw|lynx|m1\\-w|m3ga|m50\\/|ma(te|ui|xo)|mc(01|21|ca)|m\\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\\-2|po(ck|rt|se)|prox|psio|pt\\-g|qa\\-a|qc(07|12|21|32|60|\\-[2-7]|i\\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\\-|oo|p\\-)|sdk\\/|se(c(\\-|0|1)|47|mc|nd|ri)|sgh\\-|shar|sie(\\-|m)|sk\\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\\-|v\\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\\-|tdg\\-|tel(i|m)|tim\\-|t\\-mo|to(pl|sh)|ts(70|m\\-|m3|m5)|tx\\-9|up(\\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\\-|your|zeto|zte\\-", re.I|re.M)

    b = reg_b.search(ua)
    v = reg_v.search(ua[0:4])
    if b or v:
        return 'wap'
    else:
        return 'pc'


'''@@@@@@@@@@@@@@@----->>>>>> end of global helpers <<<<<<-----@@@@@@@@@@@@@@@@'''
@app.route('/')
def index():

    domain = request.base_url
    host = search('^([^/]*?)/', re.sub(r'(https|http)://(www|m)\.', '', domain), 1)
    canshu = pagedata[host]

    ua = get_ua()
    shebei = get_shebei()
    page = "index"

    print ua, shebei, page

    words = findall_mysql_words()

    # both branches currently render the PC template; a separate mobile template is not wired up yet
    if 'http://www.' in domain:
        return render_template('1_pc_index.html', words=words, canshu=canshu)
    else:
        return render_template('1_pc_index.html', words=words, canshu=canshu)

@app.route('/list/<query_md5>.html')
def list(query_md5):
    domain = request.base_url
    host = search('^([^/]*?)/', re.sub(r'(https|http)://(www|m)\.', '', domain), 1)
    canshu = pagedata[host]

    ua = get_ua()

    word = md5_words(query_md5)['word']
    words = findall_mysql_words()


    '''If this keyword already has cached list content, serve it; otherwise scrape it now'''
    number = judge_lists_word(query_md5)
    print number
    if number == 0:

        # # Scrape thumbnails (image.so.com)
        # img_lists = []
        # url = 'https://image.so.com/j?q=%s&src=srp&pn=10' % word
        # html = getHTml(url)
        # html_dict = json.loads(html)
        # for y in html_dict['list']:
        #   img_lists.append(y['_thumb_bak'])

        # Scrape content for the keyword and write it to the database
        wx_dict = {
            'word': word,
            'md5': query_md5,
            'text': [],  # was [{}]; the empty dict rendered as a blank entry in the template
        }

        # Scrape Sogou WeChat search results (disabled for now; the test server is short on memory)
        # url = 'https://weixin.sogou.com/weixin?type=2&s_from=input&query=%s' % word
        # html = getHTml(url)
        # content = html
        # box = search(r'<ul class="news-list">([\s\S]*?)</ul>',content,1)
        # for i in re.findall(r'<li[^>]*?>([\s\S]*?)</li>',box):
        #   title = re.sub('<[^>]*?>','',search(r'<h3>([\s\S]*?)</h3>',i,1)).strip()
        #   descript = re.sub('<[^>]*?>','',search(r'<p[^>]*?>([\s\S]*?)</p>',i,1)).replace('"','\'').strip()
        #   auto = search(r'uigs="article_account_\d">(.*?)</a>',i,1).strip()
        #   imgurl = read_imgurl(random.randint(1,27015))
        #   textlink = 'https://weixin.sogou.com%s' % search(r'href="(/link\?[^"]*?)"',i,1).strip()

        #   wx_dict['text'].append({
        #       'title':title,
        #       'desc':descript,
        #       'auto':auto,
        #       'imgurl':imgurl,
        #       'textlink':textlink,
        #   })

        #   sql_wx = 'insert into text value ("%s","%s","%s","%s","%s","%s")'  %   (query_md5,title,descript,auto,imgurl,textlink)
        #   input_data(sql_wx)
        
        # Scrape Baidu search results
        url = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=dealio_dg&wd=%s' % word
        html_bd = getHTml(url)

        # Extract Baidu's related-search terms and add them to the main keyword table
        xg_list = []
        xg_list_2 = []
        box_bd_xg = search(r'<table cellpadding="0">([\s\S]*?)</table>', html_bd, 1)
        for xg in re.findall(r'<a[^>]*?>(.*?)</a>', box_bd_xg):
            xg = xg.strip()
            xgmd5 = md5(xg)
            xg_list.append({
                'xgword': xg, 'xgmd5': xgmd5,
            })
            input_data('INSERT INTO words VALUES (%s,%s,%s)', (xg, xgmd5, today))

            xg_list_2.append("%s|%s" % (xg, xgmd5))
        

        # Join the related terms and store them in the relation table
        xg_str = '@'.join(xg_list_2)

        input_data('INSERT INTO xg_word VALUES (%s,%s,%s)', (query_md5, word, xg_str))
        
        # Extract the Baidu results and cache them in the text table

        for x in re.findall(r'<div class="result c-container "[^>]*?>([\s\S]*?)</div></div>', html_bd):
            bd_title = re.sub('<[^>]*?>', '', search(r'<h3[^>]*?>([\s\S]*?)</h3>', x, 1))
            bd_desc = re.sub('<[^>]*?>', '', search(r'<div class="c-abstract">([\s\S]*?)</div>', x, 1))
            bd_auto = re.sub('<[^>]*?>', '', re.sub('<style>[\s\S]*?</style>', '', search(r'<div class="f13">([\s\S]*?)</div>', x, 1)))
            bd_imgurl = read_imgurl(random.randint(1, 27015))
            bd_textlink = search(r'href="(http://www.baidu.com/link\?url=[^"]*?)"', x, 1)

            wx_dict['text'].append({
                'title': bd_title,
                'desc': bd_desc,
                'auto': bd_auto,
                'imgurl': bd_imgurl,
                'textlink': bd_textlink,
            })

            input_data('insert into text values (%s,%s,%s,%s,%s,%s)',
                       (query_md5, bd_title, bd_desc, bd_auto, bd_imgurl, bd_textlink))

        return render_template('1_pc_list.html',words=words,wx_dict=wx_dict['text'],word=word,xg_list=xg_list,canshu=canshu)
    else:
        word_dict = read_word_wxtext(query_md5)
        xg_list = read_xgword(query_md5)
        return render_template('1_pc_list.html',words=words,wx_dict=word_dict,word=word,xg_list=xg_list,canshu=canshu)

@app.errorhandler(404)
def page_not_found(e):
    words = findall_mysql_words()
    return render_template('404.html',words=words), 404

@app.route('/robots.txt')
def robots():
    return render_template('robots.txt')

if __name__ == '__main__':
    app.debug = True  # disable in production
    app.run(host='0.0.0.0', port=5100, threaded=True)
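
The title promises result caching, and the script imports ttl_cache from cachetools but never applies it. A minimal sketch of how the decorator could cache the two hottest database reads; the maxsize/ttl values below are assumptions, not settings from the original:

# Sketch only: cache hot read paths with cachetools' TTL decorator.
# maxsize and ttl are assumed values, not the author's settings.
from cachetools.func import ttl_cache

@ttl_cache(maxsize=1, ttl=600)
def findall_mysql_words_cached():
    # 'order by rand() limit 300' scans the whole table; serve a cached
    # batch for up to 10 minutes instead of querying on every request
    return findall_mysql_words()

@ttl_cache(maxsize=10000, ttl=3600)
def read_word_wxtext_cached(query_md5):
    # rows in the text table rarely change once a keyword has been scraped
    return read_word_wxtext(query_md5)

The routes would then call the *_cached wrappers instead of the raw DB helpers.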


database:

# Image table (random thumbnails attached to result entries)
CREATE TABLE `imgurl` (
  `imgurl` varchar(255) DEFAULT NULL,
  `id` int(11) NOT NULL AUTO_INCREMENT,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=27016 DEFAULT CHARSET=utf8;


# Main keyword table
CREATE TABLE `words` (
  `word` varchar(255) DEFAULT NULL,
  `md5` varchar(255) DEFAULT NULL,
  `add_date` date DEFAULT NULL,
  UNIQUE KEY `word` (`word`(191)) USING BTREE,
  KEY `md5` (`md5`(191))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;


# Cached search-result rows for each keyword
# (every read filters on md5; an index on a prefix of md5 would speed lookups)
CREATE TABLE `text` (
  `md5` varchar(1024) DEFAULT NULL,
  `wx_title` varchar(1024) DEFAULT NULL,
  `wx_desc` varchar(5000) DEFAULT NULL,
  `wx_auto` varchar(255) DEFAULT NULL,
  `wx_imgurl` varchar(1024) DEFAULT NULL,
  `wx_textlink` varchar(1024) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;


# Related keywords (xgword stores word|md5 pairs joined with @)
CREATE TABLE `xg_word` (
  `md5` varchar(255) DEFAULT NULL,
  `word` varchar(255) DEFAULT NULL,
  `xgword` varchar(1024) DEFAULT NULL,
  UNIQUE KEY `md5` (`md5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
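

The words table starts empty, so the pool needs an initial keyword list. A hedged one-off seeding sketch; the file name seed_words.txt (one keyword per line) is an assumption, and the connection settings mirror the script above:

# coding:utf-8
# Sketch only: seed the words table from a plain-text keyword list.
# 'seed_words.txt' is a hypothetical file: one keyword per line.
import datetime, hashlib
import MySQLdb as mdb

con = mdb.connect(host='localhost', port=3306, user='root', passwd='password',
                  db='ceshi', charset='utf8', unix_socket='/tmp/mysql.sock')
cur = con.cursor()
today = datetime.date.today()

for line in open('seed_words.txt'):
    word = line.strip()
    if not word:
        continue
    try:
        # the UNIQUE KEY on word makes duplicate seeds fail harmlessly
        cur.execute('insert into words values (%s,%s,%s)',
                    (word, hashlib.md5(word).hexdigest(), today))
        con.commit()
    except Exception:
        con.rollback()
con.close()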


nginx conf (wildcard subdomain resolution and the mobile redirect are not enabled):

server {
    listen 80;

    server_name  www.domain.com;

    access_log  /www/wwwlogs/domain.log;
    # access_log off;
    error_log  /www/wwwlogs/domain_error.log;

    root /www/wwwroot/www.domain.com;

    if ($host ~* ([a-z0-9][a-z0-9\-]+?\.(?:com|cn|net|org|info|la|cc|co|gz|ah|gd|nm|sh|tj|gov|sx|gz|sh|sc|faith|date|space)(?:\.cn)?)$ ) {
       set $domain $1;
    }
    
    if ($host ~* ^([a-z0-9][a-z0-9\-]+?\.(?:com|cn|net|org|info|la|cc|co|gz|ah|gd|nm|sh|tj|gov|sx|gz|sh|sc|faith|date|space)(?:\.cn)?)$){
       rewrite ^/(.*)$ http://www.$domain/$1 permanent;
    }
    

    # Block scraping tools such as Scrapy; note the curl block has been removed
    if ($http_user_agent ~* (Scrapy|HttpClient)) {
         return 403;
    }

    # Block abusive user agents
    if ($http_user_agent ~* (DotBot|MegaIndex|MJ12bot|NHN|Twiceler|ToutiaoSpider|AhrefsBot|YandexBot)) {
            return  403;
    }

    # Reject request methods other than GET|HEAD|POST
    if ($request_method !~ ^(GET|HEAD|POST)$) {
        return 403;
    }


    location / {
        proxy_pass         http://127.0.0.1:5100/;
        proxy_redirect     off;

        proxy_set_header   Host             $host;
        proxy_set_header   X-Real-IP        $remote_addr;
        proxy_set_header   X-Forwarded-For  $proxy_add_x_forwarded_for;
    }
}
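
To verify the stack end to end (www.domain.com stands in for a real host): nginx should answer 403 to a blocked tool UA before the request ever reaches Flask, while a spider UA should pass through and make get_ua() report spider=1. A hedged sketch:

# Sketch only: probe the nginx UA rules from outside.
# www.domain.com is the placeholder host from the config above.
import requests

base = 'http://www.domain.com/'

# a blocked tool UA should be rejected by nginx with 403
r = requests.get(base, headers={'User-Agent': 'Scrapy/1.6 (+https://scrapy.org)'})
print r.status_code    # expect 403

# a spider UA passes nginx; inside Flask, get_ua() matches 'spider' and sets spider=1
spider_ua = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
r = requests.get(base, headers={'User-Agent': spider_ua})
print r.status_code    # expect 200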

