-- I think this question is testing Hive row-to-column / column-to-row
-- transposition (explode + collect_set) — not sure if that's right.
--
-- Build one row per article with all "url:feature" pairs joined into a
-- single comma-separated string, written into the job-date partition.
-- NOTE(review): fixed '${hivevar:job_date}' — the original was missing the
-- '$', so the literal text '{hivevar:job_date}' would have been used as
-- the partition value instead of the substituted date.
INSERT OVERWRITE TABLE dbname.ArticleFeatures PARTITION (dt = '${hivevar:job_date}')
SELECT
    A3.articleid,
    concat_ws(',', collect_set(A3.url_features))
FROM (
    SELECT
        A1.articleid,
        -- When the LEFT JOIN finds no feature, concat() returns NULL and
        -- collect_set() silently drops it — URLs without features vanish
        -- from the output string. Presumably intentional; confirm.
        concat(A1.url, ':', A2.feature) AS url_features
    FROM (
        -- Row-to-column step: one output row per image URL in the
        -- comma-separated image_urls column.
        SELECT
            articleid,
            url
        FROM dbname.Articles
        LATERAL VIEW explode(split(image_urls, ',')) tabletest AS url
    ) A1
    LEFT JOIN dbname.ImageFeatures A2
        ON A1.url = A2.url
) A3
GROUP BY A3.articleid;
-- Map-join tuning: let Hive load the small ImageFeatures table into memory
-- and join map-side, avoiding a shuffle. The 300 MB thresholds raise the
-- default small-table size limits.
SET hive.auto.convert.join = true;
SET hive.mapjoin.smalltable.filesize = 300000000;
SET hive.auto.convert.join.noconditionaltask = true;
SET hive.auto.convert.join.noconditionaltask.size = 300000000;

-- NOTE(review): the original was hand-written and never run; fixed here:
--   * 'articled' -> 'articleid' (typo, twice)
--   * 'LATERAW VIEW' -> 'LATERAL VIEW', 'splite' -> 'split'
--   * LATERAL VIEW was missing its table alias and the mandatory
--     'AS <col>' column alias
--   * the inner SELECT projected the raw image_urls CSV string instead of
--     the exploded per-URL value, so t1.image_url = t2.url could never
--     match a single URL
INSERT INTO TABLE ArticleFeatures PARTITION (dt = '${etl_dt}')
SELECT
    t1.articleid AS articleid,
    concat_ws(',', collect_set(t2.url_feature)) AS url_features
FROM (
    -- Row-to-column: explode the comma-separated image_urls into one row
    -- per URL. DISTINCT deduplicates up front to shrink the join input.
    SELECT DISTINCT
        articleid,
        image_url
    FROM Articles
    LATERAL VIEW explode(split(image_urls, ',')) urls AS image_url
) t1
LEFT JOIN (
    -- Pre-build the "url:feature" string once per image.
    SELECT
        url,
        concat_ws(':', url, feature) AS url_feature
    FROM ImageFeatures
) t2
    ON t1.image_url = t2.url
GROUP BY t1.articleid;

-- Original author's notes (translated): "I wrote this by hand without
-- running it, so it surely has problems (PS: I usually need several test
-- runs per statement — rarely passes on the first try), but the idea is
-- the same: transpose rows to columns and concatenate. The key question
-- is how to tune it; I'd love to hear an expert's approach.
-- My tuning ideas:
--   1. Deduplicate early (DISTINCT) to reduce data volume — the data
--      itself is small, so DISTINCT directly is fine.
--   2. Enable map join in the settings so the small table is loaded
--      straight into memory for the join.
-- Beyond that I don't see much else to optimize."