S3 select content from an object

The select object content API filters the content of an object through the structured query language (SQL).

See the Metadata collected by inventory section in the AWS Systems Manager User Guide for an example of the description of what should reside in the inventory object. In the request, you must specify the data serialization format as, comma-separated values (CSV) of the object to retrieve the specified content. Amazon Web Services (AWS) command-line interface (CLI) select object content uses the CSV format to parse object data into records and returns only the records specified in the query.

Note: You must specify the data serialization format for the response. You must have s3:GetObject permission for this operation.

Syntax

POST /BUCKET/KEY?select&select-type=2 HTTP/1.1\r\n

Example

POST /testbucket/sample1csv?select&select-type=2 HTTP/1.1\r\n

Bucket

Description

The bucket to select object content from.

Type

String

Required

Yes

Key

Description

The object key.

Length Constraints

Minimum length of 1.

Type

String

Required

Yes

SelectObjectContentRequest

Description

Root level tag for the select object content request parameters.

Type

String

Required

Yes

Expression

Description

The expression that is used to query the object.

Type

String

Required

Yes

ExpressionType

Description

The type of the provided expression for example SQL.

Type

String

Valid Values

SQL

Required

Yes

InputSerialization

Description

Describes the format of the data in the object that is being queried.

Type

String

Required

Yes

OutputSerialization

Description

Format of data returned in comma separator and new-line.

Type

String

Required

Yes

Response entities

If the action is successful, the service sends back HTTP 200 response. Data is returned in XML format by the service:

Payload

Description

Root level tag for the payload parameters.

Type

String

Required

Yes

Records

Description

The records event.

Type

Base64-encoded binary data object

Required

No

Stats

Description

The stats event.

Type

Long

Required

No

The Ceph Object Gateway supports the following response:

Example

{:event-type,records} {:content-type,application/octet-stream} {:message-type,event}

Syntax (for CSV)

aws --endpoint-URL http://localhost:80 s3api select-object-content
 --bucket BUCKET NAME
 --expression-type SQL
 --input-serialization
 {"CSV": {"FieldDelimiter": "," , "QuoteCharacter": "\"" , "RecordDelimiter" : "\n" , "QuoteEscapeCharacter" : "\\" , "FileHeaderInfo": "USE" }, "CompressionType": "NONE"}
 --output-serialization {"CSV": {}}
 --key OBJECT_NAME.csv
 --expression "select count(0) from s3object where int(_1)<10;" output.csv
Note: The object names in --key OBJECT_NAME do not require .csv, .json, or .parquet from IBM Storage Ceph 7 and later.

Example (for CSV)

aws --endpoint-url http://localhost:80 s3api select-object-content
 --bucket testbucket
 --expression-type 'SQL'
 --input-serialization
 '{"CSV": {"FieldDelimiter": "," , "QuoteCharacter": "\"" , "RecordDelimiter" : "\n" , "QuoteEscapeCharacter" : "\\" , "FileHeaderInfo": "USE" }, "CompressionType": "NONE"}'
 --output-serialization '{"CSV": {}}'
 --key testobject.csv
 --expression "select count(0) from s3object where int(_1)<10;" output.csv
Syntax (for Parquet)
aws --endpoint-url http://localhost:80 s3api select-object-content
 --bucket BUCKET NAME
 --expression-type 'SQL'
 --input-serialization
 '{"Parquet": {}, {"CompressionType": "NONE"}'
 --output-serialization '{"CSV": {}}}'
 --key OBJECT_NAME.parquet
 --expression "select count(0) from s3object where int(_1)<10;" output.csv
Note: The only supported output serialization is .csv.
Example (for Parquet)
aws --endpoint-url http://localhost:80 s3api select-object-content
 --bucket testbucket
 --expression-type 'SQL'
 --input-serialization
 '{"Parquet":{}, "CompressionType": "NONE"}'
 --output-serialization '{"CSV": {}}}'
 --key testobject.parquet
 --expression "select count(0) from s3object where int(_1)<10;" output.csv
Syntax (for JSON)
aws --endpoint-url http://localhost:80 s3api select-object-content
 --bucket BUCKET_NAME
 --key OBJECT_NAME.json
 --expression-type 'SQL'
 --input-serialization
 '{"JSON": {"Type": "DOCUMENT"}, "CompressionType": "NONE"}'
 --output-serialization '{"CSV": {}}'
 --expression "select count(*) from S3Object[*];" /dev/stdout
Example (for JSON)
aws s3api --endpoint-url http://localhost:80
select-object-content --bucket test
--key testobject.json
--expression-type 'SQL'
--input-serialization
'{"JSON": {"Type": "DOCUMENT"}, "CompressionType": "NONE"}'
--output-serialization '{"CSV": {}}'
--expression "select count(*) from S3Object[*];" /dev/stdout

Example (for BOTO3)

import pprint
import boto3
from botocore.exceptions import ClientError

def run_s3select(bucket,key,query,column_delim=",",row_delim="\n",quot_char='"',esc_char='\\',csv_header_info="NONE"):

   s3 = boto3.client('s3',
       endpoint_url=endpoint,
       aws_access_key_id=access_key,
       region_name=region_name,
       aws_secret_access_key=secret_key)

   result = ""
   try:
       r = s3.select_object_content(
       Bucket=bucket,
       Key=key,
       ExpressionType='SQL',
       InputSerialization = {"CSV": {"RecordDelimiter" : row_delim, "FieldDelimiter" : column_delim,"QuoteEscapeCharacter": esc_char, "QuoteCharacter": quot_char, "FileHeaderInfo": csv_header_info}, "CompressionType": "NONE"},
       OutputSerialization = {"CSV": {}},
       Expression=query,
       RequestProgress = {"Enabled": progress})

   except ClientError as c:
       result += str(c)
       return result

   for event in r['Payload']:
           if 'Records' in event:
               result = ""
               records = event['Records']['Payload'].decode('utf-8')
               result += records
           if 'Progress' in event:
               print("progress")
               pprint.pprint(event['Progress'],width=1)
           if 'Stats' in event:
               print("Stats")
               pprint.pprint(event['Stats'],width=1)
           if 'End' in event:
               print("End")
               pprint.pprint(event['End'],width=1)

   return result




 run_s3select(
 "my_bucket",
 "my_csv_object",
 "select int(_1) as a1, int(_2) as a2 , (a1+a2) as a3 from s3object where a3>100 and a3<300;")

Supported features

Currently, only part of the AWS s3 select command is supported. For a full list of supproted features, see Table 1.

Table 1. AWS s3 select command supported features
Features Details Description Example

Arithmetic operators

^ * % / + - ( )

select (int(_1)+int(_2))*int(_9) from s3object;

Arithmetic operators

% modulo

select count(*) from s3object where cast(_1 as int)%2 = 0;

Arithmetic operators

^ power-of

select cast(2^10 as int) from s3object;

Compare operators

> < >= <= = !=

select _1,_2 from s3object where (int(_1)+int(_3))>int(_5);

logical operator

AND OR NOT

select count(*) from s3object where not (int(1)>123 and int(_5)<200);

logical operator

is null

Returns true/false for null indication in expression

logical operator and NULL

is not null

Returns true/false for null indication in expression

logical operator and NULL

unknown state

Review null-handle and observe the results of logical operations with NULL. The query returns 0.

select count(*) from s3object where null and (3>2);

Arithmetic operator with NULL

unknown state

Review null-handle and observe the results of binary operations with NULL. The query returns 0.

select count(*) from s3object where (null+1) and (3>2);

Compare with NULL

unknown state

Review null-handle and observe results of compare operations with NULL. The query returns 0.

select count(*) from s3object where (null*1.5) != 3;

missing column

unknown state

select count(*) from s3object where _1 is null;

projection column

Similar to if or then or else

select case when (1+1=(2+1)*3) then ‘case_1’ when ((4*3)=(12)) then ‘case_2’ else ‘case_else’ end, age*2 from s3object;

projection column

similar to switch/case default

select case cast(_1 as int) + 1 when 2 then “a” when 3 then “b” else “c” end from s3object;

logical operator

coalesce returns first non-null argument

select coalesce(nullif(5,5),nullif(1,1.0),age+12) from s3object;

logical operator

nullif returns null in case both arguments are equal, or else the first one,nullif(1,1)=NULL nullif(null,1)=NULL nullif(2,1)=2

select nullif(cast(_1 as int),cast(_2 as int)) from s3object;

logical operator

{expression} in ( .. {expression} ..)

select count(*) from s3object where ‘ben’ in (trim(_5),substring(_1,char_length(_1)-3,3),last_name);

logical operator

{expression} between {expression} and {expression}

select _1 from s3object where cast(_1 as int) between 800 and 900;
select count(*) from stdin where substring(_3,char_length(_3),1) between “x” and trim(_1) and substring(_3,char_length(_3)-1,1) = “:”;

logical operator

{expression} like {match-pattern}

select count(*) from s3object where first_name like ‘%de_’; select count(*) from s3object where _1 like "%a[r-s]";

logical operator

{expression} like {match-pattern} escape {char} select count(*) from s3object where “jok_ai” like “%#_ai” escape “#”;

casting operator

select cast(123 as int)%2 from s3object;

casting operator

select cast(123.456 as float)%2 from s3object;

casting operator

select cast(‘ABC0-9’ as string),cast(substr(‘ab12cd’,3,2) as int)*4 from s3object;

casting operator

select cast(substring(‘publish on 2007-01-01’,12,10) as timestamp) from s3object;

non AWS casting operator

select int(_1),int( 1.2 + 3.4) from s3object;

non AWS casting operator

select float(1.2) from s3object;

non AWS casting operator

select to_timestamp('1999-10-10T12:23:44Z') from s3object;

Aggregation Function

sum

select sum(int(_1)) from s3object;

Aggregation Function

avg

select avg(cast(_1 as float) + cast(_2 as int)) from s3object;

Aggregation Function

min

select min( int(_1) * int(_5) ) from s3object;

Aggregation Function

max

select max(float(_1)),min(int(_5)) from s3object;

Aggregation Function

count

select count(*) from s3object where (int(1)+int(_3))>int(_5);

Timestamp Functions

extract

select count(*) from s3object where extract(year from to_timestamp(_2)) > 1950 and extract(year from to_timestamp(_1)) < 1960;

Timestamp Functions

dateadd

select count(0) from s3object where date_diff(year,to_timestamp(_1),date_add(day,366,to_timestamp(_1))) = 1;

Timestamp Functions

datediff

select count(0) from s3object where date_diff(month,to_timestamp(_1),to_timestamp(_2)) = 2;

Timestamp Functions

utcnow

select count(0) from s3object where date_diff(hour,utcnow(),date_add(day,1,utcnow())) = 24

Timestamp Functions

to_string

select to_string( to_timestamp(“2009-09-17T17:56:06.234567Z”), “yyyyMMdd-H:m:s”) from s3object;

String Functions

substring

select count(0) from s3object where int(substring(_1,1,4))>1950 and int(substring(_1,1,4))<1960;

substring with from negative number is valid considered as first select substring(“123456789” from -4) from s3object;
substring with from zero for out-of-bound number is valid just as first,last. select substring(“123456789” from 0 for 100) from s3object;

String Functions

trim

select trim(‘ foobar ‘) from s3object;

String Functions

trim

select trim(trailing from ‘ foobar ‘) from s3object;

String Functions

trim

select trim(leading from ‘ foobar ‘) from s3object;

String Functions

trim

select trim(both ‘12’ from ‘1112211foobar22211122’) from s3object;

String Functions

lower or upper

select lower(‘ABcD12#$e’) from s3object;

String Functions

char_length, character_length

select count(*) from s3object where char_length(_3)=3;

Complex queries

select sum(cast(_1 as int)),max(cast(_3 as int)), substring(‘abcdefghijklm’, (2-1)*3+sum(cast(_1 as int))/sum(cast(_1 as int))+1, (count() + count(0))/count(0)) from s3object;

alias support

select int(_1) as a1, int(_2) as a2 , (a1+a2) as a3 from s3object where a3>100 and a3<300;