Install Command-line Client
$ sudo apt-get install apt-transport-https ca-certificates dirmngr
$ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4
$ echo "deb https://repo.clickhouse.tech/deb/stable/ main/" | sudo tee \
/etc/apt/sources.list.d/clickhouse.list
$ sudo apt-get update
$ sudo apt-get install -y clickhouse-server clickhouse-client
$ sudo service clickhouse-server start
$ clickhouse-client
External Data
- External Data for Query Processing: ClickHouse can load external data at query time (external tables can only be sent with a SELECT query). For example, the command-line client accepts the following options:
$ clickhouse-client --help
...
External tables options:
--file arg data file or - for stdin
--name arg (=_data) name of the table
--format arg (=TabSeparated) data format
--structure arg structure
--types arg types
# Usage
# --external Marks the beginning of a clause.
# --file       Path to the file with the table dump, or `-`, which refers to stdin. Only a single table can be retrieved from stdin.
# The following options are optional:
# --name       Name of the table. If omitted, `_data` is used.
# --format     Data format in the file. If omitted, `TabSeparated` is used.
# Exactly one of the following two options must be specified:
# --types      A list of comma-separated column types. For example: UInt64,String. The columns will be named _1, _2, …
# --structure  The table structure in the format UserID UInt64, URL String. Defines the column names and types.
$ clickhouse-client \
--external --file=... [--name=...] [--format=...] [--types=...|--structure=...]
# examples
$ clickhouse-client --host 172.16.105.16 --port 31234 \
--user admin --password xxxxxxxx \
--query "SELECT * FROM external_lineorder" \
--external \
--file=lineorder.tbl \
--name=external_lineorder \
--format=CSV \
--structure="LO_ORDERKEY UInt32, LO_LINENUMBER UInt8, LO_CUSTKEY UInt32, LO_PARTKEY UInt32, LO_SUPPKEY UInt32, LO_ORDERDATE Date, LO_ORDERPRIORITY LowCardinality(String), LO_SHIPPRIORITY UInt8, LO_QUANTITY UInt8, LO_EXTENDEDPRICE UInt32, LO_ORDTOTALPRICE UInt32, LO_DISCOUNT UInt8, LO_REVENUE UInt32, LO_SUPPLYCOST UInt32, LO_TAX UInt8, LO_COMMITDATE Date, LO_SHIPMODE LowCardinality(String)"
For the supported values of --format, see Formats for Input and Output Data.
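The structure can also be given as bare types and the data piped in from stdin; a minimal sketch assuming a local server (the table name ids and the sample rows are made up). With --types, the columns are addressed as _1, _2, …
$ echo -e "1,foo\n2,bar" | clickhouse-client \
    --query "SELECT _1, _2 FROM ids" \
    --external --file=- --name=ids --format=CSV --types="UInt64,String"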
Operation
System Tables
# List the queries currently executing on the cluster
SELECT query_id, user, address, query FROM system.processes ORDER BY query_id ASC
Query id: f148aae4-f0e3-460b-9481-65371ca13113
┌─query_id─────────────────────────────┬─user──┬─address────────────┬─query─────────────────────────────────────────────────────────────────────────────┐
│ f148aae4-f0e3-460b-9481-65371ca13113 │ admin │ ::ffff:10.240.1.27 │ SELECT query_id, user, address, query FROM system.processes ORDER BY query_id ASC │
└──────────────────────────────────────┴───────┴────────────────────┴───────────────────────────────────────────────────────────────────────────────────┘
1 rows in set. Elapsed: 0.003 sec.
# Kill a running query by its query_id
KILL QUERY WHERE query_id = 'query_id'
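KILL QUERY also accepts the TEST and SYNC modifiers; a minimal sketch, reusing the query_id from the example above:
-- dry run: only list the queries that would be killed
KILL QUERY WHERE query_id = 'f148aae4-f0e3-460b-9481-65371ca13113' TEST
-- kill and wait until the query has actually stopped
KILL QUERY WHERE query_id = 'f148aae4-f0e3-460b-9481-65371ca13113' SYNC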
# Query cluster-wide event counters
SELECT * FROM system.events
Query id: 3dc8e9f7-852a-4917-846e-eb8220b403c2
┌─event───────┬─value─┬─description──────────────────────────────────────────────────────────────┐
│ Query       │   257 │ Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries. │
│ SelectQuery │   150 │ Same as Query, but only for SELECT queries. │
│ InsertQuery │    76 │ Same as Query, but only for INSERT queries. │
...
77 rows in set. Elapsed: 0.003 sec.
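The counters can be filtered like any other table; for example, to look only at query-related events (a minimal sketch):
SELECT event, value FROM system.events WHERE event LIKE '%Query%'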
Practices
Execute SQL File
# Initialize the tables
clickhouse-client --host 10.53.26.177 --port 31234 -u admin --password xxxxxx --multiquery < init_table.sql
cluster="hadoop"
prefix="bj"
clickhouse-client --host 10.53.26.177 --port 31234 -u admin --password xxxxxx --query "truncate table dlink.${prefix}_all_file_uid"
# Iterate over the Parquet files in the directory
for FILENAME in /data/analysis/$cluster/user/sre.bigdata/all_file_uniqueId.parquet/*.parquet; do
# The Log engine locks the whole table during a write and blocks all reads; concurrent reads are only supported while no write is in progress (a sketch of such a staging table follows this script)
cat $FILENAME | clickhouse-client \
--host 10.53.26.177 --port 31234 -u admin --password xxxxxx \
--query "INSERT INTO dlink.${prefix}_all_file_uid FORMAT Parquet" \
--max_insert_block_size=100000
done
clickhouse-client --host 10.53.26.177 --port 31234 -u admin --password xxxxxx \
--query "INSERT INTO dlink.file_uid_info(md5, filePath, fileSize, cluster, isInnerAvro) SELECT md5, filePath, fileSize, '$cluster', 0 FROM dlink.${prefix}_all_file_uid"
clickhouse-client --host 10.53.26.177 --port 31234 -u admin --password xxxxxx --query "drop table dlink.${prefix}_all_file_uid"
Import Data
Import TSV
# Connect to clickhouse container
$ kubectl.exe exec -it clickhouse-5 -- bash
# Start the ClickHouse client
$ /usr/bin/clickhouse-client --host clickhouse-5.clickhouse -u admin --password xxxxxxxx
-- Create the database on every node of cluster `cat`
CREATE DATABASE dlink ON CLUSTER cat
-- Replicated per-shard table
CREATE TABLE dlink.avro_schemas_shard ON CLUSTER cat
(
id UInt32,
cluster LowCardinality(String),
filePath String,
schema Nullable(String),
fileNums Int32
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard_cat}/dlink.avro_schemas_shard', '{replica_cat}')
ORDER BY (id)
-- Distributed table over the shard tables
CREATE TABLE dlink.avro_schemas ON CLUSTER cat AS dlink.avro_schemas_shard
ENGINE = Distributed('cat', 'dlink', 'avro_schemas_shard', rand())
# Import the TSV data
$ nohup /usr/bin/clickhouse-client --host clickhouse-5.clickhouse -u admin --password xxxxxxxx --query 'INSERT INTO dlink.avro_schemas FORMAT TabSeparated' --max_insert_block_size=100000 < /tmp/hdfs_avro_schemas.txt >import.log 2>&1 &
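Once the import finishes, a quick row-count sanity check on the distributed table (same host and credentials as above):
$ /usr/bin/clickhouse-client --host clickhouse-5.clickhouse -u admin --password xxxxxxxx --query 'SELECT count() FROM dlink.avro_schemas'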
Import Data from HDFS
-- Map the HDFS directories as external tables
CREATE TABLE dlink.hdfs_bj_avro_inner_file_uid
(
md5 String,
filePath String,
fileSize Nullable(UInt64),
rowNumber Nullable(UInt32),
bytesFieldName Nullable(String)
)
ENGINE = HDFS('hdfs://sensetime-data-hadoop/user/sre.bigdata/avro_file_uniqueId_parquet', 'Parquet')
CREATE TABLE dlink.hdfs_bj_all_file_uid
(
md5 String,
filePath String,
fileSize Nullable(UInt64)
)
ENGINE = HDFS('hdfs://sensetime-data-hadoop/user/sre.bigdata/all_file_uniqueId.parquet', 'Parquet')
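The HDFS-backed tables can be queried directly, which is a cheap way to sanity-check the mapping before importing; a minimal sketch:
SELECT count() FROM dlink.hdfs_bj_all_file_uid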
-- Create the local (per-shard) table
CREATE TABLE IF NOT EXISTS dlink.file_uid_info_shard ON CLUSTER cat
(
md5 String,
cluster LowCardinality(String),
isInnerAvro UInt8, -- Boolean Type
filePath String,
fileSize Nullable(UInt64),
rowNumber Nullable(UInt32),
bytesFieldName Nullable(String)
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard_cat}/dlink.file_uid_info_shard', '{replica_cat}')
ORDER BY (md5)
-- Create the distributed table
CREATE TABLE dlink.file_uid_info ON CLUSTER cat AS dlink.file_uid_info_shard
ENGINE = Distributed('cat', 'dlink', 'file_uid_info_shard', rand())
-- Import data from the HDFS-backed tables into the distributed table
INSERT INTO dlink.file_uid_info (md5, filePath, fileSize, rowNumber, bytesFieldName, cluster, isInnerAvro) SELECT md5, filePath, fileSize, rowNumber, bytesFieldName, 'hadoop', 1 FROM dlink.hdfs_bj_avro_inner_file_uid;
INSERT INTO dlink.file_uid_info (md5, filePath, fileSize, cluster, isInnerAvro) SELECT md5, filePath, fileSize, 'hadoop', 0 FROM dlink.hdfs_bj_all_file_uid;
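To verify the import, group by the constant columns that were added; a minimal sketch:
SELECT cluster, isInnerAvro, count() AS cnt FROM dlink.file_uid_info GROUP BY cluster, isInnerAvro;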
Import Data from File
- Table Engines - File (a usage sketch follows this list)
CREATE TABLE dlink.bj_avro_inner_file_uid
(
    md5 String,
    filePath String,
    fileSize Nullable(UInt64),
    rowNumber Nullable(UInt32),
    bytesFieldName Nullable(String)
)
ENGINE = File(TabSeparated)
- Table Functions - file
Note: the file table function can only read files under user_files_path
INSERT INTO dlink.file_uid_info (md5, filePath, fileSize, cluster, isInnerAvro)
SELECT md5, filePath, fileSize, 'hadoop', 1
FROM file('/var/lib/clickhouse/user_files/sre.bigdata/all_file_uniqueId.parquet/*', 'Parquet', 'md5 String, filePath String, fileSize Nullable(UInt64)')
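For the File engine table above, ClickHouse reads and writes a data.TabSeparated file inside the table's data directory (under the server data path, typically /var/lib/clickhouse/data/dlink/bj_avro_inner_file_uid/); once a dump has been placed there, the table can be queried directly. A minimal sketch:
-- assumes a TSV dump was copied into the table's data directory as data.TabSeparated
SELECT count() FROM dlink.bj_avro_inner_file_uid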
Query Database and Table Sizes
-- Total size across all databases
select
    sum(rows) as total_rows,
    formatReadableSize(sum(data_uncompressed_bytes)) as uncompressed_size,
    formatReadableSize(sum(data_compressed_bytes)) as compressed_size,
    round(sum(data_compressed_bytes) / sum(data_uncompressed_bytes) * 100, 0) as compression_ratio_pct
from system.parts;
-- Size of the test table for October 2019
select
    table as table_name,
    sum(rows) as total_rows,
    formatReadableSize(sum(data_uncompressed_bytes)) as uncompressed_size,
    formatReadableSize(sum(data_compressed_bytes)) as compressed_size,
    round(sum(data_compressed_bytes) / sum(data_uncompressed_bytes) * 100, 0) as compression_ratio_pct
from system.parts
-- adjust the filter conditions to your own tables and partitions
where table in('test')
    and partition like '2019-10-%'
group by table;
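A variant of the queries above that counts only active parts (inactive parts left over from merges would otherwise inflate the totals) and lists the largest tables first; a sketch using the same system.parts columns:
select
    database,
    table,
    sum(rows) as total_rows,
    formatReadableSize(sum(data_compressed_bytes)) as compressed_size
from system.parts
where active
group by database, table
order by sum(data_compressed_bytes) desc
limit 10;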
View and Kill Queries
-- same command as in MySQL
show processlist;
-- if there are too many entries, you can also query the system table `processes`
select * from system.processes;
-- select only the columns of interest
select
user,query_id,query,elapsed,memory_usage
from system.processes;
-- with the process information obtained above, kill a query by its query_id
KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90'
-- kill all queries of the default user
KILL QUERY WHERE user='default'