import re import base64 import io from fontTools.ttLib import TTFont import requests def getData(url): print(url) #url = 'https://b2b.huangye88.com/qiye2497072/company_contact.html' s = requests.session() s.keep_alive = False response = requests.get(url, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}) response_ = response.text if len(response_) < 5000: print("已被删除"+url) return ''' pageLost=["已被删除","企业不存在","企业含违禁内容","分类已关闭"] for page in pageLost: if page in response_: print("已被删除"+url) return ''' font_data=re.findall('8;base64,(.*?)"',response.text) convert_dict = { 'hyphen':'-', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'period': '.', 'zero': '0' } #print(font_data[0]) #font_face='AAEAAAAKAIAAAwAgT1MvMla194sAAACsAAAAYGNtYXALMAPOAAABDAAAAa5nbHlmZrwdwAAAArwAAAakaGVhZBQx4JoAAAlgAAAANmhoZWEFswFxAAAJmAAAACRobXR4DVYBYgAACbwAAAAubG9jYQwQCnYAAAnsAAAAIm1heHAAFABOAAAKEAAAACBuYW1lUuodRwAACjAAAAGecG9zdDHgxUkAAAvQAAAAdAAEAgsBkAAFAAACmQLMAAAAjwKZAswAAAHrADMBCQAAAgAGAwAAAAAAAAAAAAEQAAAAAAAAAAAAAABQZkVkAMAAI4N8Ayz/LABcAywA1AAAAAEAAAAAAxgAAAAAACAAAQAAAAQAAAADAAAAJAABAAAAAABcAAMAAQAAACQAAwAKAAABYgAEADgAAAAKAAgAAgACACMAKwAtAC///wAAACMAKgAtAC/////e/9j/1//WAAEAAAAAAAAAAAAAAAABBgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAgMABAAFAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMAAAAAABMAAAAAAAAAAUAAAAjAAAAIwAAAAEAAAAqAAAAKwAAAAIAAAAtAAAALQAAAAQAAAAvAAAALwAAAAUACINzAAiDfAAAAAYAAAACACIAAAEyAqoAAwAHAAA3ESERJzMRIyIBEO7MzAACqv1WIgJmAAAAAgAdAAACIALbABsAHwAAARUjByM3IwcjNyM1MzcjNTM3MwczNzMHMxUjByMzNyMB/4AmSCZrJ0knZnQjdoQkSSVrJkkmYnAitWwkbAEUR83Nzc1HuUjGxsbGSLm5AAAAAQAkAKQB3gI2ABEAABM3FyczBzcXBxcHJxcjNwcnNyQumSJzJZkun58umSRyIZguoAGXZ26mpGpmKClma6anbWYqAAABAEMAkwH6AkoACwAAARUjNSM1MzUzFTMVAUNKtrZKtwFKt7dJt7dJAAAAAAEAGgFCASQBrQADAAATNSEVGgEKAUJrawAAAAABAAD/gwEnAwoAAwAAFycTM0pK30h9AQOGAAAAAgAj//YCGgLmABMAJwAAARQOAiMiLgI1ND4CMzIeAgUUHgIzMj4CNTQuAiMiDgICGhw9X0NGYDwaGjxgR0JfPRz+qAgUJB0cJBUHBxQkHB0kFQgBb1WLYzY2Y4xVVYpiNTVii1VKc08qKk9zSklzTykpT3MAAAAAAQArAAACCgLfACEAADc1MzI+AjURDgMjIi4CNT4DPwEzERQeAjsBFWRUDRMNBhQiIB8PDRUQChAiJiwaSHIFCxUQUgA3Bg8aEwIBGCccDwoUHBEEDBIbEjX9mhAZEQg3AAAAAAEAJAAAAg4C5gArAAABFA4EDwEzMjY/ATMHITU3PgM1NCYjIgYVIi4CNTQ+AjMyHgIB9AsYKDtPM2fvHy0JCD0G/hyYLz0jDiomNCodMCMTHThUODpXPB4CPBgtMDZATjFhJCMf12qaMU5HRSg6NllYCxgnGxwyJhcYLD8AAAAAAQAd//YCDgLmAEQAABciLgI1ND4CMxQeAjMyPgI1NC4CKwE1MzI+AjU0JiMiDgIVIiY1ND4CMzIeAhUUDgIHHgMVFA4C+TpTNhkOGB8SEiEvHBktIxUVKDsnP0MhMSAQKyobIxMHQEUdOVQ4N1c+IRgqOSIfQTUiL01kChQiLRgTHhUKITEhEA4iOiweMSMUQBUoOCE4PxstOR4tLxsvJBQWKz4oIzouIgwFGSo/LD5VNBYAAgAOAAACKQLbABgAIwAAJRUUHgI7ARUhNTMyPgI9ASE1ATMRMxUlNDY3DgMPATMBvw0XHxEN/pkcEh4XDf7lASKPav8AAwQFFhkXBorUvz8YHQ8FNzcFDx0YPz4B3v4nQ/YtaDAMKiwoCeUAAQAp//YCBgLbADoAADcyPgI1NCYjIg4CBycTIRcjJy4DKwEUDgIPAT4DMzIeAhUUDgIjIi4CNTQ2MxQeAuwZLiIVSUMTIBsYCy8gAYQFOwgCBgsQDNUCAgMBCAgZHiIPPGBFJTBNXy85UDIXLSUMGis+ECVAL0xLAwUHAxIBYrojCQ4KBgEQGyISXgMGBAMcNlI3Q1o3GBUiLRgkIxYsIxYAAAACAC7/9gIZAuYALAA8AAABIg4CBz4DMzIeAhUUDgIjIi4CNTQ+AjMyHgIVFA4CIzQuAgMiDgIHFB4CMzI2NTQmAUkeMSMVAwobIysaL0s2HR48WDs5XUMlJEhuSjJFKxMNHS4iBg8bNw4fHBgGEh4pFygtMgKpJEVkQQcNCwcdN04yN1tBJCpWg1lVk20/EyAoFhAdFg0XLyYY/tkIDhIJSWpEIFBZU0wAAAAAAQAtAAACGwLbAAsAADcBISIGDwEjNyEVAakBEf7yHBwDBj4FAen+5QACbBsZNNcy/VcAAAMAH//2Ah4C5gAlADkATQAANzQ+AjcuATU0PgIzMh4CFRQOAgceAxUUDgIjIi4CFzI+AjU0LgInDgMVFB4CEzQuAiMiDgIVFB4CFz4DHxUoOCE9QRg4W0I2UjcbEyQzIC5BKBMkQ2E+QF4+Hf4aKx4QESU4KBEeFQ0RHit6DBgkFxUhFgsOHCkbExsSCLshNSslESNaPCRDNCAbMEInHi8nIRAXLTI2HzFLNBwfNUhiEyIvHBkpIyISCx0jLBseMiMUAgQWKyEUER8qGBsoIBkNCxkgKAAAAAIAJP/2Ag8C5gAoADYAABciLgI1NDY3HgMzMjY3DgMjIi4CNTQ+AjMyHgIVFA4CAzI2NzQuAiMiBhUUFukvQCgRGBoHFR4nGkVKBQwdJS0aLEo1HiA9Vzc3XkUmIUdvHyU1DxEcKBgsMDAKFCAqFhYfBRcoHRGVkw8ZEwobNk80N1tCJChUglpVlG9AAW4lH0JePB1WV0dJAAAAAAEAAAABAACkAILFXw889QALBAAAAAAA2XTOiAAAAADZdM6IAAD/gwIpAwoAAAAIAAIAAAAAAAAAAQAAAyz/LABcAj0AAAAAAikAAQAAAAAAAAAAAAAAAAAAAAcBdgAiAj0AHQICACQCPQBDAT4AGgEnAAACPQAjACsAJAAdAA4AKQAuAC0AHwAkAAAAAAAUAEQAZgB8AIoAmADUAQYBRgGgAdYCKAJ+ApgDBANSAAAAAQAAABAATgADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwAlgABAAAAAAABAA0AAAABAAAAAAACAAYADQABAAAAAAADAA0AEwABAAAAAAAEAA0AIAABAAAAAAAFAB4ALQABAAAAAAAGAA0ASwADAAEECQABABoAWAADAAEECQACAAwAcgADAAEECQADABoAfgADAAEECQAEABoAmAADAAEECQAFADwAsgADAAEECQAGABoA7kxlZVRyZWVzaGFkb3dNZWRpdW1MZWVUcmVlc2hhZG93TGVlVHJlZXNoYWRvd1ZlcnNpb24gMS4wOyBGb250RWRpdG9yICh2MS4wKUxlZVRyZWVzaGFkb3cATABlAGUAVAByAGUAZQBzAGgAYQBkAG8AdwBNAGUAZABpAHUAbQBMAGUAZQBUAHIAZQBlAHMAaABhAGQAbwB3AEwAZQBlAFQAcgBlAGUAcwBoAGEAZABvAHcAVgBlAHIAcwBpAG8AbgAgADEALgAwADsAIABGAG8AbgB0AEUAZABpAHQAbwByACAAKAB2ADEALgAwACkATABlAGUAVAByAGUAZQBzAGgAYQBkAG8AdwAAAAIAAAAAAAAAMgAAAAAAAAAAAAAAAAAAAAAAAAAAABAAEAAAAAYADQAOABAAEgECAQMBBAEFAQYBBwEIAQkBCgELBHplcm8Db25lA3R3bwV0aHJlZQRmb3VyBGZpdmUDc2l4BXNldmVuBWVpZ2h0BG5pbmU=' base64_str = font_data[0] # 提取前端中的字体加密字符串 b = base64.b64decode(base64_str) # base64解码 font = TTFont(io.BytesIO(b)) # 读取ttf数据 bestcmap = font['cmap'].getBestCmap() # 得到camp标签下的数据 #print(bestcmap) newmap = dict() for key in bestcmap.keys(): newmap[hex(key)] = bestcmap[key] #print(newmap) # 把页面上自定义字体替换成正常字体,这一步就很简单了 for key,value in newmap.items(): key_ = key.replace('0x','&#x') + ';' if key_ in response_: #print(value) response_ = response_.replace(key_,convert_dict[value]) #print(response_) print("success") for i in range(80000): url = 'https://b2b.huangye88.com/qiye{}/company_contact.html'.format(2497656+i) getData(url) print("---------")