自定义模拟网站案例 · python-网页到放弃

# ===========================================================================
# Imitating a website request
# ===========================================================================
#
#   1. Process the URL: derive four values from it -- the protocol
#      (http/https), host, port and path.
#   2. Example: for https://www.kancloud.cn/book, first take the protocol,
#      then split what is left ("www.kancloud.cn/book") into the domain
#      (with its optional port) and the path.

import socket
import ssl
from urllib.parse import urljoin


# ===========================================================================
# Writing the client
# ===========================================================================

# ---------------------------------------------------------------------------
# Processing the URL
# ---------------------------------------------------------------------------
#
#   1. Use a socket the way a browser does to request data from the server.
#   2. Split the URL into protocol, host, port and path.
#   3. Use find() to locate the split position, then slice around it.
#   4. Keep fixed values in a dict to avoid if-branches.

def parsed_url(url):
    """
    Split a URL into the pieces needed to open a connection.

    :param url: the URL to parse; the "http://" / "https://" prefix is optional
    :return: a (protocol, host, port, path) tuple
    :raises ValueError: if an explicit port is not an integer
    """
    # Scheme: plain http is assumed when the URL carries no prefix.
    protocol = 'http'
    u = url
    # The branches must be mutually exclusive; otherwise the fallback branch
    # would put the scheme back onto every "http://" URL.
    # The prefix is sliced off (rather than split on "://") so that a second
    # "://" inside the path or query string is left intact.
    if url.startswith('http://'):
        u = url[len('http://'):]
    elif url.startswith('https://'):
        protocol = 'https'
        u = url[len('https://'):]

    # Separate "host[:port]" from the path.
    i = u.find('/')
    if i == -1:
        host = u
        path = '/'
    else:
        host = u[:i]
        path = u[i:]

    # Default port per protocol, overridden by an explicit ":port".
    port_dict = {
        'http': 80,
        'https': 443,
    }
    port = port_dict[protocol]
    if ':' in host:
        host, port_text = host.split(':', 1)
        # "host:" with nothing after the colon keeps the default port.
        if port_text:
            port = int(port_text)

    return protocol, host, port, path


# ---------------------------------------------------------------------------
# Trying out a unit test
# ---------------------------------------------------------------------------

def test_parsed_url():
    """Check parsed_url against a table of (url, expected tuple) pairs."""
    http = 'http'
    https = 'https'
    host = 'g.cn'
    path = '/'
    test_items = [
        ('http://g.cn', (http, host, 80, path)),
        ('http://g.cn/', (http, host, 80, path)),
        ('http://g.cn:90', (http, host, 90, path)),
        ('http://g.cn:90/', (http, host, 90, path)),
        #
        ('https://g.cn', (https, host, 443, path)),
        ('https://g.cn:233/', (https, host, 233, path)),
    ]
    for t in test_items:
        url, expected = t
        u = parsed_url(url)
        e = "parsed_url ERROR, ({}) ({}) ({})".format(url, u, expected)
        assert u == expected, e


# ---------------------------------------------------------------------------
# Writing the client
# ---------------------------------------------------------------------------
#
#   1. Factor the protocol decision out into socket_by_protocol.
#   2. response_by_socket loops until it has read everything the server sent.
#   3. parsed_response extracts status code, headers and body; it returns a
#      tuple so the caller can unpack it into several names at once.
#   4. In the top-level function a list can stand in for an if-chain as well.

def socket_by_protocol(protocol, host=None):
    """
    Create the socket matching the protocol.

    :param protocol: "http" gives a plain TCP socket, anything else a TLS one
    :param host: server name used for SNI and certificate verification;
                 optional so existing one-argument callers keep working
    :return: an unconnected socket.socket or ssl.SSLSocket
    """
    s = socket.socket()
    if protocol == "http":
        return s

    # ssl.wrap_socket() no longer exists in Python 3.12+, so the socket is
    # wrapped through an SSLContext instead.
    context = ssl.create_default_context()
    if host is None:
        # Without a host name the certificate cannot be matched; fall back
        # to an unverified connection, as the module-level ssl.wrap_socket()
        # did by default. check_hostname has to be switched off before
        # verify_mode can be relaxed.
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
    return context.wrap_socket(s, server_hostname=host)


def response_by_socket(s):
    """
    Read from a socket until the peer closes the connection.

    :param s: a connected socket instance
    :return: all received data as bytes
    """
    buffer_size = 1024
    chunks = []
    while True:
        r = s.recv(buffer_size)
        # An empty read means the peer has closed the connection.
        if len(r) == 0:
            break
        chunks.append(r)
    return b''.join(chunks)


def parsed_response(r):
    """
    Parse a raw HTTP response into its parts.

    :param r: the decoded response text
    :return: (status_code, headers, body) where status_code is an int,
             headers is a dict and body is a str
    :raises ValueError: if the status line is missing or malformed
    """
    # Only the first blank line separates headers from body; the body itself
    # may contain more blank lines.
    header, _, body = r.partition('\r\n\r\n')
    h = header.split('\r\n')

    status_parts = h[0].split()
    if len(status_parts) < 2:
        raise ValueError('malformed status line: {!r}'.format(h[0]))
    status_code = int(status_parts[1])

    headers = {}
    for line in h[1:]:
        # Split on the first colon only, so values that themselves contain
        # ": " are kept whole.
        k, sep, v = line.partition(':')
        if not sep:
            continue
        headers[k.strip()] = v.strip()
    return status_code, headers, body


def get(url):
    """
    Fetch a URL with a GET request, following 301/302 redirects.

    :param url: the URL to request
    :return: the (status_code, headers, body) tuple of the final response
    """
    protocol, host, port, path = parsed_url(url)
    # "Connection: close" asks the server not to keep the connection alive,
    # so reading until EOF yields the complete response.
    request = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(path, host)

    # The with-block closes the socket even when the request fails.
    with socket_by_protocol(protocol, host) as s:
        s.connect((host, port))
        # sendall() keeps writing until the whole request has gone out.
        s.sendall(request.encode("utf-8"))
        response = response_by_socket(s)

    r = response.decode("utf-8")
    status_code, headers, body = parsed_response(r)
    if status_code in [301, 302]:
        # Header names are case-insensitive, so do not rely on exact casing.
        location = next(
            (v for k, v in headers.items() if k.lower() == 'location'),
            None,
        )
        if location is not None:
            # Resolve relative redirect targets against the requested URL.
            base = '{}://{}:{}{}'.format(protocol, host, port, path)
            return get(urljoin(base, location))
    return status_code, headers, body


# ===========================================================================
# Server -- server.py
# ===========================================================================
#
#   1. Open the socket in a with-block so that the port is released once we
#      are done with it.
#   2. Receive the request from the client and split the path out of it.
#   3. Hand the path to the routing function.
#   4. response_for_path does the route mapping: a dict maps each path to
#      its handler. The dict stores the function objects themselves, so no
#      handler runs until it has been selected.
#   5. The routing function uses dict.get() with a default to fall back to
#      the 404 handler.
#   6. Send the data back to the client.

def log(*args, **kwargs):
    """
    Use this log function instead of print.
    """
    print('log', *args, **kwargs)


def route_index():
    """
    Handler for the home page; returns the home page response as bytes.
    """
    header = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n'
    body = '<h1>Hello Gua</h1><img src="/doge.gif">'
    r = header + '\r\n' + body
    return r.encode(encoding='utf-8')


def page(name):
    """
    Read a UTF-8 text file and return its content.

    :param name: path of the file to read
    """
    with open(name, encoding='utf-8') as f:
        return f.read()


def route_msg():
    """
    Handler for the msg page; the body comes from html_basic.html.
    """
    header = 'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n'
    body = page('html_basic.html')
    r = header + '\r\n' + body
    return r.encode(encoding='utf-8')


def route_image():
    """
    Handler for the image; reads doge.gif and returns it as a response.
    """
    with open('doge.gif', 'rb') as f:
        header = b'HTTP/1.1 200 OK\r\nContent-Type: image/gif\r\n'
        img = header + b'\r\n' + f.read()
        return img


def error(code=404):
    """
    Return the error response for a status code; only 404 exists so far.

    :param code: HTTP status code
    :return: the response bytes, or b'' for a code without an entry
    """
    # Numbers are normally discouraged as dict keys, but HTTP status codes
    # are numeric, so using them directly is the more convenient choice here.
    e = {
        404: b'HTTP/1.1 404 NOT FOUND\r\n\r\n<h1>NOT FOUND</h1>',
    }
    return e.get(code, b'')


def response_for_path(path):
    """
    Call the handler registered for a path; unknown paths get the 404
    response.

    :param path: the request path
    :return: the response bytes
    """
    r = {
        '/': route_index,
        '/doge.gif': route_image,
        '/msg': route_msg,
    }
    # The values are function objects; only the selected one is called.
    response = r.get(path, error)
    return response()


def run(host='', port=3000):
    """
    Start the server and handle requests forever.

    :param host: address to bind to ('' binds every interface)
    :param port: TCP port to listen on
    """
    # Using with makes sure the socket is closed, and the port released,
    # when the program is interrupted.
    with socket.socket() as s:
        s.bind((host, port))
        # Listening is set up once, not on every loop iteration.
        s.listen(5)
        # Loop forever, handling one request per accepted connection.
        while True:
            connection, address = s.accept()
            # Close the connection once the request has been handled,
            # whether or not handling succeeded.
            with connection:
                try:
                    request = connection.recv(1024)
                    log('raw, ', request)
                    # Decoding happens inside the try so that bytes which
                    # are not valid UTF-8 cannot take the server down.
                    request = request.decode('utf-8')
                    log('ip and request, {}\n{}'.format(address, request))
                    # Chrome sends empty requests, which makes split()
                    # return an empty list; the try keeps the server alive.
                    path = request.split()[1]
                    # Look up the response for the path and send it back.
                    response = response_for_path(path)
                    connection.sendall(response)
                except Exception as e:
                    log('error', e)


def main():
    """Build the configuration and run the server."""
    config = dict(
        host='',
        port=3000,
    )
    # The dict is unpacked into keyword arguments (**kwargs).
    run(**config)


if __name__ == '__main__':
    main()