💎一站式轻松地调用各大LLM模型接口,支持GPT4、智谱、星火、月之暗面及文生图 广告
冗余数据的方式来建模,其实用的就是object类型,我们这里又要引入一种新的object类型:nested object类型。个人感觉它就像把结构化数据粘合在一起一样:同一实体的属性之间关联,不同实体之间隔离 ## 1\. object与nested object 1. 首先插入一条数据 ~~~ PUT /website/blogs/6 { "title": "花无缺发表的一篇帖子", "content": "我是花无缺,大家要不要考虑一下投资房产和买股票的事情啊。。。", "tags": [ "投资", "理财" ], "comments": [ { "name": "小鱼儿", "comment": "什么股票啊?推荐一下呗", "age": 28, "stars": 4, "date": "2016-09-01" }, { "name": "黄药师", "comment": "我喜欢投资房产,风,险大收益也大", "age": 31, "stars": 5, "date": "2016-10-22" } ] } ~~~ 查询被年龄是28岁的黄药师评论过的博客,搜索 ~~~ GET /website/blogs/_search { "query": { "bool": { "must": [ { "match": { "comments.name": "黄药师" }}, { "match": { "comments.age": 28 }} ] } } } ~~~ 得到 ~~~ "_index": "website", "_type": "blogs", "_id": "6", "_score": 1.8022683, "_source": { "title": "花无缺发表的一篇帖子", "content": "我是花无缺,大家要不要考虑一下投资房产和买股票的事情啊。。。", "tags": [ "投资", "理财" ], "comments": [ { "name": "小鱼儿", "comment": "什么股票啊?推荐一下呗", "age": 28, "stars": 4, "date": "2016-09-01" }, { "name": "黄药师", "comment": "我喜欢投资房产,风,险大收益也大", "age": 31, "stars": 5, "date": "2016-10-22" } ] } } ] } ~~~ 黄药师的年龄是31,但是用28就搜出来了,这显然是不行的,为什么会这样呢? 原因在于object类型数据结构的底层存储:文档都被扁平化了。object类型底层数据结构,会将一个json数组中的数据进行扁平化,同一条评论里name和age之间的对应关系因此丢失 ~~~ { "title": [ "花无缺", "发表", "一篇", "帖子" ], "content": [ "我", "是", "花无缺", "大家", "要不要", "考虑", "一下", "投资", "房产", "买", "股票", "事情" ], "tags": [ "投资", "理财" ], "comments.name": [ "小鱼儿", "黄药师" ], "comments.comment": [ "什么", "股票", "推荐", "我", "喜欢", "投资", "房产", "风险", "收益", "大" ], "comments.age": [ 28, 31 ], "comments.stars": [ 4, 5 ], "comments.date": [ 2016-09-01, 2016-10-22 ] } ~~~ 所以我们用age=28,也能搜索到 2. 
引入nested object类型,来解决object类型底层数据结构导致的问题 * 删除index,修改映射,把comments改成nested类型 ~~~ PUT /website { "mappings": { "blogs": { "properties": { "comments": { "type": "nested", "properties": { "name": { "type": "string" }, "comment": { "type": "string" }, "age": { "type": "short" }, "stars": { "type": "short" }, "date": { "type": "date" } } } } } } } ~~~ 此时,comments底层存储发生改变(每条评论作为一个独立的隐藏文档存储,同一条评论的属性粘合到了一起) ~~~ { "comments.name": [ "小鱼儿" ], "comments.comment": [ "什么", "股票", "推荐" ], "comments.age": [ 28 ], "comments.stars": [ 4 ], "comments.date": [ 2016-09-01 ] } { "comments.name": [ "黄药师" ], "comments.comment": [ "我", "喜欢", "投资", "房产", "风险", "收益", "大" ], "comments.age": [ 31 ], "comments.stars": [ 5 ], "comments.date": [ 2016-10-22 ] } { "title": [ "花无缺", "发表", "一篇", "帖子" ], "content": [ "我", "是", "花无缺", "大家", "要不要", "考虑", "一下", "投资", "房产", "买", "股票", "事情" ], "tags": [ "投资", "理财" ] } ~~~ * 重新插入数据 * 查询方式改变 ~~~ GET /website/blogs/_search { "query": { "bool": { "must": [ { "match": { "title": "花无缺" } }, { "nested": { "path": "comments", "query": { "bool": { "must": [ { "match": { "comments.name": "黄药师" } }, { "match": { "comments.age": 31 } } ] } } } } ] } } } ~~~ 此时,只有用age=31才可以查询到我们想要的数据,用age=28则查询不到 ~~~ "_score": 3.6845634, "_source": { "title": "花无缺发表的一篇帖子", "content": "我是花无缺,大家要不要考虑一下投资房产和买股票的事情啊。。。", "tags": [ "投资", "理财" ], "comments": [ { "name": "小鱼儿", "comment": "什么股票啊?推荐一下呗", "age": 28, "stars": 4, "date": "2016-09-01" }, { "name": "黄药师", "comment": "我喜欢投资房产,风,险大收益也大", "age": 31, "stars": 5, "date": "2016-10-22" } ] } } ] } } ~~~ ## 2\. nested object 聚合分析 1. 
聚合数据分析的需求:按照评论日期进行bucket划分,然后拿到每个月的评论的评分的平均值 ~~~ GET /website/blogs/_search { "size": 0, "aggs": { "comments_path": { "nested": { "path": "comments" # 指定类型为nested }, "aggs": { # 按照时间聚合 "group_by_comments_date": { "date_histogram": { "field": "comments.date", "interval": "month", "format": "yyyy-MM" }, "aggs": { "avg_stars": { "avg": { "field": "comments.stars" } } } } } } } } ~~~ 得到 ~~~ "aggregations": { "comments_path": { "doc_count": 2, "group_by_comments_date": { "buckets": [ { "key_as_string": "2016-09", "key": 1472688000000, "doc_count": 1, "avg_stars": { "value": 4 } }, { "key_as_string": "2016-10", "key": 1475280000000, "doc_count": 1, "avg_stars": { "value": 5 } } ] } } } } ~~~ 2. 按照年龄段来分组,然后按照tags分组 ~~~ GET /website/blogs/_search { "size": 0, "aggs": { "comments_path": { "nested": { "path": "comments" # 绑定为nested聚合,在comments对象中聚合 }, "aggs": { "group_by_comments_age": { "histogram": { "field": "comments.age", "interval": 10 }, "aggs": { "reverse_path": { "reverse_nested": {}, # 撤销nested,在comments对象外聚合 "aggs": { "group_by_tags": { "terms": { "field": "tags.keyword" } } } } } } } } } } ~~~ 得到如下结果:20岁和30岁两个年龄段下,投资、理财各一个,因为只有一个doc,且`tags: [ "投资", "理财" ]` ~~~ "group_by_comments_age": { "buckets": [ { "key": 20, "doc_count": 1, "reverse_path": { "doc_count": 1, "group_by_tags": { "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0, "buckets": [ { "key": "投资", "doc_count": 1 }, { "key": "理财", "doc_count": 1 } ] } } }, { "key": 30, "doc_count": 1, "reverse_path": { "doc_count": 1, "group_by_tags": { "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0, "buckets": [ { "key": "投资", "doc_count": 1 }, { "key": "理财", "doc_count": 1 } ~~~ 3. 按照年龄段来分组,然后按照评分分组 ~~~ GET /website/_search { "size": 0, "aggs": { "comments_nested": { "nested": { "path": "comments" # 绑定为nested聚合,在comments对象中聚合 }, "aggs": { "groupby_age": { "histogram": { "field": "comments.age", "interval": 10 }, "aggs": { "groupby_start": { "terms": { "field": "comments.stars" # 因为在comments内,不用解绑 } } } } } } } } ~~~