选取文件
更新时间:2023-05-18
SelectObject接口支持用户对BOS中指定格式(CSV/JSON/Parquet)的object内容执行SQL语句,通过SQL这种结构化查询语言对object内容进行筛选、分析、过滤之后再返回用户需要的文件内容。请参考选取Object
使用限制见“开发者指南“-”使用及管理数据”-“选取object”部分。
查询csv文件
示例:
from baidubce import compat
import base64
# 方便理解,我们先上传一个简单的csv文件
csv_content = """
1,Maurits,2017-09-1216:32:57,685856330,-540265154.48,true
2,Iago,2018-02-01 12:25:01,-642946677,3781354659.89,false
3,Dionisio,2018-02-16 09:52:24,-3823711977,79336720.77,false
4,Aleen,2018-05-17 11:48:45,-3289131518,1499686289.41,false
5,Herschel,2019-06-04 02:28:37,3456163349,-3810272511.88,true
"""
bos_client.put_object_from_string(bucket_name, key, csv_content)
# 设置select_object()接口的参数
select_object_args = {
"expressionType": "SQL",
"inputSerialization": {
"compressionType": "NONE",
"csv": {
"fileHeaderInfo": "NONE",
"recordDelimiter": "Cg==",
"fieldDelimiter": "LA==",
"quoteCharacter": "Ig==",
"commentCharacter": "Iw=="
}
},
"outputSerialization": {
"outputHeader": False,
"csv": {
"quoteFields": "ALWAYS",
"recordDelimiter": "Cg==",
"fieldDelimiter": "LA==",
"quoteCharacter": "Ig=="
}
},
"requestProgress": {
"enabled": True
}
}
# 设置查询的sql语句,需要经过base64编码的sql语句
sql_exp = "SELECT _1, _2, _6 FROM BosObject"
select_object_args["expression"] = compat.convert_to_string(base64.standard_b64encode(compat.convert_to_bytes(sql_exp)))
# 调用select object接口
select_response = bos_client.select_object(bucket_name, key, select_object_args)
# 获取返回结果的生成器
result = select_response.result()
for msg in result:
print(msg)
if msg.headers["message-type"] == "Records":
print("type: {}, heades: {}, payload: {}, crc: {}".format(msg.type, msg.headers, msg.payload, msg.crc))
elif msg.headers["message-type"] == "Cont":
print("type: {}, heades: {}, bytes_scanned: {}, bytes_returned: {}, crc: {}".format(msg.type, msg.headers,
msg.bytes_scanned, msg.bytes_returned, msg.crc))
else:
print("type: {}, heades: {}, crc: {}".format(msg.type, msg.headers, msg.crc))
查询json文件
示例:
from baidubce import compat
import base64
# 方便理解,我们先上传一个简单的json文件
json_content = """
{
"name": "Smith",
"age": 16,
"weight": 65.5,
"org": null,
"projects":
[
{"project_name":"project1", "completed":false},
{"project_name":"project2", "completed":true}
]
}
"""
bos_client.put_object_from_string(bucket_name, key, json_content)
# 设置select_object()接口的参数
select_object_args = {
"expressionType": "SQL",
"inputSerialization": {
"compressionType": "NONE",
"json": {
"type": "DOCUMENT"
}
},
"outputSerialization": {
"json": {
"recordDelimiter": "Cg=="
}
},
"requestProgress": {
"enabled": True
}
}
# 设置查询的sql语句,需要经过base64编码的sql语句
sql_exp = "select projects from BosObject where name='Smith'"
select_object_args["expression"] = compat.convert_to_string(base64.standard_b64encode(compat.convert_to_bytes(sql_exp)))
# 调用select object接口
select_response = bos_client.select_object(bucket_name, key, select_object_args)
# 获取返回结果的生成器
result = select_response.result()
for msg in result:
print(msg)
if msg.headers["message-type"] == "Records":
print("type: {}, heades: {}, payload: {}, crc: {}".format(msg.type, msg.headers, msg.payload, msg.crc))
elif msg.headers["message-type"] == "Cont":
print("type: {}, heades: {}, bytes_scanned: {}, bytes_returned: {}, crc: {}".format(msg.type, msg.headers,
msg.bytes_scanned, msg.bytes_returned, msg.crc))
else:
print("type: {}, heades: {}, crc: {}".format(msg.type, msg.headers, msg.crc))
查询Parquet文件
from baidubce import compat
import base64
# 我们先上传一个简单的Parquet文件
'''
parquet文件解析内容
{"Name":"StudentName","Age":20,"Id":0,"Weight":50,"Sex":true,"Day":19240,"Scores":{"computer":80,"math":90,"physics":90}}
{"Name":"StudentName","Age":21,"Id":1,"Weight":50.1,"Sex":false,"Day":19240,"Scores":{"computer":81,"math":91,"physics":91}}
{"Name":"StudentName","Age":22,"Id":2,"Weight":50.2,"Sex":true,"Day":19240,"Scores":{"computer":82,"math":92,"physics":92}}
{"Name":"StudentName","Age":23,"Id":3,"Weight":50.3,"Sex":false,"Day":19240,"Scores":{"computer":83,"math":93,"physics":90}}
{"Name":"StudentName","Age":24,"Id":4,"Weight":50.4,"Sex":true,"Day":19240,"Scores":{"computer":84,"math":94,"physics":91}}
{"Name":"StudentName","Age":20,"Id":5,"Weight":50.5,"Sex":false,"Day":19240,"Scores":{"computer":85,"math":90,"physics":92}}
{"Name":"StudentName","Age":21,"Id":6,"Weight":50.6,"Sex":true,"Day":19240,"Scores":{"computer":86,"math":91,"physics":90}}
{"Name":"StudentName","Age":22,"Id":7,"Weight":50.7,"Sex":false,"Day":19240,"Scores":{"computer":87,"math":92,"physics":91}}
{"Name":"StudentName","Age":23,"Id":8,"Weight":50.8,"Sex":true,"Day":19240,"Scores":{"computer":88,"math":93,"physics":92}}
{"Name":"StudentName","Age":24,"Id":9,"Weight":50.9,"Sex":false,"Day":19240,"Scores":{"computer":89,"math":94,"physics":90}}
'''
bos_client.put_object_from_file(bucket_name, key, parquet_file_name)
# 设置select_object()接口的参数
select_object_args = {
"expressionType": "SQL",
"inputSerialization": {
"compressionType": "NONE",
"parquet": {}
},
"outputSerialization": {
"json": {
"recordDelimiter": "Cg=="
}
},
"requestProgress": {
"enabled": false
}
}
# 设置查询的sql语句,需要经过base64编码的sql语句
sql_exp = "select * from BosObject s where s.Scores.computer > 85"
select_object_args["expression"] = compat.convert_to_string(base64.standard_b64encode(compat.convert_to_bytes(sql_exp)))
# 调用select object接口
select_response = bos_client.select_object(bucket_name, key, select_object_args)
# 获取返回结果的生成器
result = select_response.result()
for msg in result:
print(msg)
if msg.headers["message-type"] == "Records":
print("type: {}, heades: {}, payload: {}, crc: {}".format(msg.type, msg.headers, msg.payload, msg.crc))
elif msg.headers["message-type"] == "Cont":
print("type: {}, heades: {}, bytes_scanned: {}, bytes_returned: {}, crc: {}".format(msg.type, msg.headers,
msg.bytes_scanned, msg.bytes_returned, msg.crc))
else:
print("type: {}, heades: {}, crc: {}".format(msg.type, msg.headers, msg.crc))