検証結果
| # |
クエリ |
実行時間 |
I/O量 |
| 1 |
select count(*) from amazon_reviews_parquet |
5.6秒 |
0KB |
| 2 |
select count(year) from amazon_reviews_parquet |
6.63秒 |
2.58MB |
| 3 |
select count(review_body) from amazon_reviews_parquet |
5.7秒 |
34.05GB |
| 4 |
select * from amazon_reviews_parquet limit 10000 |
2.05秒 |
455.11MB |
| 5 |
select year from amazon_reviews_parquet limit 10000 |
0.92秒 |
163.79KB |
| 6 |
select review_body from amazon_reviews_parquet limit 10000 |
0.54秒 |
3.25MB |

準備手順
$ aws s3 mb s3://amazon-reviews-pds-az
$ aws s3 cp --recursive s3://amazon-reviews-pds/ s3://amazon-reviews-pds-az
CREATE EXTERNAL TABLE amazon_reviews_parquet(
marketplace string,
customer_id string,
review_id string,
product_id string,
product_parent string,
product_title string,
star_rating int,
helpful_votes int,
total_votes int,
vine string,
verified_purchase string,
review_headline string,
review_body string,
review_date bigint,
year int)
PARTITIONED BY (product_category string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
's3://amazon-reviews-pds-az/parquet/'
MSCK REPAIR TABLE amazon_reviews_parquet
情報採取
- select count(*) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:12:00Z'
and eventtime < '2018-09-23T07:35:00Z'
group by eventname
order by apicall_count desc
| eventname |
apicall_count |
| GetObject |
2230 |
| HeadObject |
2203 |
| HeadBucket |
176 |
| ListObjects |
43 |
| PutObject |
2 |
- select count(year) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:35:00Z'
and eventtime < '2018-09-23T07:55:00Z'
group by eventname
order by apicall_count desc
| eventname |
apicall_count |
| GetObject |
2894 |
| HeadObject |
2238 |
| HeadBucket |
229 |
| ListObjects |
43 |
| PutObject |
2 |
- select count(review_body) from amazon_reviews_parquet
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T07:50:00Z'
and eventtime < '2018-09-23T08:00:00Z'
group by eventname
order by apicall_count desc
| eventname |
apicall_count |
| GetObject |
3027 |
| HeadObject |
2248 |
| HeadBucket |
668 |
| ListObjects |
43 |
| PutObject |
2 |
- select * from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:10:00Z'
and eventtime < '2018-09-23T08:20:00Z'
group by eventname
order by apicall_count desc
| eventname |
apicall_count |
| GetObject |
725 |
| HeadObject |
607 |
| HeadBucket |
105 |
| ListObjects |
43 |
| PutObject |
2 |
- select year from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:20:00Z'
and eventtime < '2018-09-23T08:30:00Z'
group by eventname
order by apicall_count desc
| eventname |
apicall_count |
| HeadObject |
2095 |
| GetObject |
1934 |
| HeadBucket |
327 |
| ListObjects |
43 |
| PutObject |
2 |
- select review_body from amazon_reviews_parquet limit 10000
select eventname, count(eventname) as apicall_count from default.cloudtrail_logs_cloudtrail_123456789012_do_not_delete
where eventsource = 's3.amazonaws.com'
and useragent = 'athena.amazonaws.com'
and awsregion = 'us-east-1'
and eventtime > '2018-09-23T08:40:00Z'
and eventtime < '2018-09-23T08:59:00Z'
group by eventname
order by apicall_count desc
| eventname |
apicall_count |
| GetObject |
1153 |
| HeadObject |
1117 |
| HeadBucket |
500 |
| ListObjects |
43 |
| PutObject |
2 |