机器学习与数据挖掘玩转大数据

SQL结构化查询语言

2017-07-07  本文已影响0人  JasonJe

SQL Structured Query Language 结构化查询语言

1.1 Manipulation 数据操纵

SELECT * FROM celebs; 提取celebs表中的全部数据

integer 整数类型
text 文本字符串类型
date 日期类型
real 实数类型

CREATE TABLE table_name (
    column_1 data_type, 
    column_2 data_type, 
    column_3 data_type
  );
INSERT INTO celebs (id, name, age)
VALUES (1, 'Justin Bieber', 21);
SELECT name FROM celebs;
UPDATE celebs
SET age = 22
WHERE id = 1;

增加一列数据定义

ALTER TABLE celebs ADD COLUMN twitter_handle TEXT;
DELETE FROM celebs WHERE twitter_handle IS NULL;

1.2 Queries 数据查询

SELECT name, imdb_rating FROM movies;

【DISINCT 操作符 保证查询结果为唯一值】

SELECT DISTINCT genre FROM movies;

【WHERE 操作符 过滤查询/条件查询】

SELECT * FROM movies WHERE imdb_rating > 8;

WHERE 条件运算符有
= 等于
!= 不等于
> 大于
< 小于
>= 大于等于
<= 小于等于

【LIKE 操作符 模糊查询】

SELECT * FROM movies
WHERE name LIKE 'Se_en';
SELECT * FROM movies
WHERE name LIKE 'a%';

SELECT * FROM movies
WHERE name LIKE '%man%'

LIKE 通配符
_ 通配单个字符
% 通配零个或多个字符

A% 匹配所有开头为'A'的字符
%a 匹配所有结尾为'a'的字符

【BETWEEN ... AND ... 操作符 匹配范围内的记录】

SELECT * FROM movies
WHERE name BETWEEN 'A' AND 'J';
SELECT * FROM movies
WHERE year BETWEEN 1990 AND 2000
AND genre = 'comedy';

【AND & OR 操作符 逻辑过滤】

如果第一个条件和第二个条件都成立,则 AND 运算符显示一条记录
如果第一个条件和第二个条件中只要有一个成立,则 OR 运算符显示一条记录

SELECT * FROM movies
WHERE genre = 'comedy'
OR year < 1980;

【ORDER BY DESC/ASC 操作符 排序】

ASC 升序排序
DESC 降序排序

SELECT * FROM movies
ORDER BY imdb_rating DESC;

【LIMIT 操作符 限制记录数量】

SELECT * FROM movies
ORDER BY imdb_rating ASC
LIMIT 3;

1.3 Aggregate Functions 聚合函数

【COUNT(...) 计数函数】

SELECT COUNT(*) FROM fake_apps;

【GROUP BY 分组函数】

SELECT price, COUNT(*) FROM fake_apps
GROUP BY price;
SELECT price, COUNT(*) FROM fake_apps
WHERE downloads > 20000
GROUP BY price;

【SUM(...) 求和函数】

SELECT SUM(downloads) FROM fake_apps;
SELECT category, SUM(downloads) FROM fake_apps
GROUP BY category;

【MAX(...) 最大值函数】

SELECT MAX(downloads) FROM fake_apps;
SELECT name, category, MAX(downloads) FROM fake_apps
GROUP BY category;

【MIN(...) 最小值函数】

SELECT MIN(downloads) FROM fake_apps;

【AVG(...) 平均函数】

SELECT AVG(downloads) FROM fake_apps;
SELECT price, AVG(downloads) FROM fake_apps
GROUP BY price;

【ROUND(...) 舍入为指定小数位数函数】

SELECT price, ROUND(AVG(downloads), 2) FROM fake_apps
GROUP BY price;

1.4 Multiple Tables 多表

【PRIMARY KEY 主键】

定义为主键的一列不为NULL值,都是唯一值

CREATE TABLE artists(id INTEGER PRIMARY KEY, name TEXT)

【FOREIGN KEY 外键】

SELECT albums.name, albums.year, artists.name FROM albums, artists;

【INNER JOIN 内连接】

SELECT
*
FROM
albums
JOIN artists ON
albums.artist_id = artists.id;

【LEFT JOIN 左连接】

SELECT
*
FROM
albums
LEFT JOIN artists ON
albums.artist_id = artists.id;

【AS 关键字 重命名列名】

SELECT
albums.name AS 'Album',
albums.year,
artists.name AS 'Artist'
FROM
albums
JOIN artists ON
albums.artist_id = artists.id
WHERE
albums.year > 1980;

1.5 Subqueries 子查询

【嵌套查询】

SELECT *
FROM flights
WHERE origin in (
SELECT code
FROM airports
WHERE elevation < 2000);
SELECT a.dep_month,
   a.dep_day_of_week,
   AVG(a.flight_distance) AS average_distance
FROM (
    SELECT dep_month,
          dep_day_of_week,
           dep_date,
           sum(distance) AS flight_distance
      FROM flights
     GROUP BY 1,2,3
   ) a
GROUP BY 1,2
ORDER BY 1,2;

【外部查询取出一行后,再对内部查询中的每一行内进行运算,TRUE时候返回查询结果】

SELECT id
FROM flights AS f
WHERE distance < (
SELECT AVG(distance)
FROM flights
WHERE carrier = f.carrier
);

1.6 Set Operations

【UNION 并集】

SELECT column_name(s) FROM table1
UNION
SELECT column_name(s) FROM table2;

UNION ALL会列出所有的值,即不去重

【INTERSECT 交集】

SELECT category FROM new_products
INTERSECT
SELECT category FROM legacy_products;

【EXCEPT 差集】

SELECT category FROM new_products
EXCEPT
SELECT category FROM legacy_products;

1.7 Conditional Aggregates 条件聚合

IS NULL 值为空
IS NOT NULL 值不为空

【CASE 控制结构】

SELECT
CASE
    WHEN elevation < 250 THEN 'Low'
    WHEN elevation BETWEEN 250 AND 1749 THEN 'Medium'
    WHEN elevation >= 1750 THEN 'High'
    ELSE 'Unknown'
END AS elevation_tier
, COUNT(*)
FROM airports
GROUP BY 1;

【COUNT(CASE WHEN)】

SELECT    state, 
COUNT(CASE WHEN elevation >= 2000 THEN 1 ELSE NULL END) as count_high_elevation_aiports 
FROM airports 
GROUP BY state;

【SUM(CASE WHEN)】

SELECT origin, sum(distance) as total_flight_distance, sum(CASE WHEN carrier = 'DL' THEN distance ELSE 0 END) as total_delta_flight_distance 
FROM flights 
GROUP BY origin;
SELECT origin, 100.0*(sum(CASE WHEN carrier = 'UN' THEN distance ELSE 0 END)/sum(distance)) as percentage_flight_distance_from_united 
FROM flights 
GROUP BY origin;

SELECT state,
100.0 * sum(CASE WHEN elevation >= 2000 THEN 1 ELSE 0 END) / count(*)  as percentage_high_elevation_airports
FROM airports
GROUP BY state;

1.8 Date, Number, and String Functions 日期,数字和字符串函数

【Dates 时间函数】

SELECT DATETIME(manufacture_time)
FROM baked_goods;
SELECT DATE(manufacture_time), count(*) as count_baked_goods
FROM baked_goods
GROUP BY DATE(manufacture_time);
SELECT TIME(manufacture_time), count(*) as count_baked_goods
FROM baked_goods
GROUP BY TIME(manufacture_time);

在当前时间进行时间的增加,下述例子在当前时间上增加了1天2小时30分钟

SELECT DATETIME(manufacture_time, '+2 hours', '30 minutes', '1 day') as inspection_time
FROM baked_goods;

【Numbers 数字函数】

SELECT ROUND(ingredients_cost, 4) as rounded_cost
FROM baked_goods;

SELECT (number1 + number2) 返回两个数字的和
SELECT CAST(number1 AS REAL) / number3 CAST()函数用于将某种数据类型的表达式显式转换为另一种数据类型
SELECT ROUND(number, precision) 四舍五入数字,precision为精度

返回两列数据中同行的最小值

SELECT id, MIN(cook_time, cool_down_time)
FROM baked_goods;

【|| 字符串连接运算】

SELECT string1 || ' ' || string2; 利用"||"连接string1和一个空格和string2

【REPLACE() 字符串替换运算】

REPLACE(string,from_string,to_string) 将string列中的"from_string"字符串替换为"to_string"字符串

上一篇下一篇

猜你喜欢

热点阅读