大數據期末考

第一題 -1

第一題 -2

第一題 -3

第一題 -4

第二題 -1

第二題 -2

第二題 -3

爬取網頁 - 下載題目程式碼 area_populations.csv 和 populations1.csv

需要安裝套件：pandas、lxml（pd.read_html 預設使用 lxml 解析 HTML）

import pandas as pd

# Source page on zh.wikipedia.org; the page title in the path is percent-encoded.
url = 'https://zh.wikipedia.org/zh-tw/%E8%87%BA%E7%81%A3%E8%A1%8C%E6%94%BF%E5%8D%80%E4%BA%BA%E5%8F%A3%E5%88%97%E8%A1%A8'
# read_html yields one DataFrame per HTML table; row 0 of each table is used
# as the header and the default NaN markers are not applied while parsing.
tables = pd.read_html(url, header=0, keep_default_na=False)

# --- Table 0 -> area_populations.csv ---------------------------------------
# NOTE(review): these labels carry surrounding spaces; confirm they match the
# scraped headers exactly, because drop() raises KeyError on an unknown label.
county_unused_columns = [
    ' 排名 ',
    ' 英語 ',
    ' 分類 ',
    ' 縣市旗 ',
    '2021 年至 2022 年 人口消長 ',
    ' 政府所在地 ',
    ' 資料來源 ',
]
df = tables[0].drop(columns=county_unused_columns)
# Exactly three columns must remain for this rename to succeed.
df.columns = [' 行政區 ', ' 面積 ', ' 人口 ']
print(df.head())
df.to_csv("area_populations.csv", index=False, encoding='UTF8')

# --- Table 1 -> populations1.csv -------------------------------------------
district_unused_columns = [
    ' 總排名 ',
    ' 直轄市排名 ',
    '2021 年至 2022 年 人口消長 ',
    ' 資料來源 ',
]
df1 = tables[1].drop(columns=district_unused_columns)
# Exactly three columns must remain for this rename to succeed.
df1.columns = [' 區 ', ' 市 ', ' 人口 ']
print(df1.head())
df1.to_csv("populations1.csv", index=False, encoding='UTF8')

第一題 -1

import pandas as pd

# Load the CSV file.
df = pd.read_csv('area_populations.csv')

# Add a population-density column: population divided by area, row by row.
density_column = ' 人口密度 '
df[density_column] = df[' 人口 '].div(df[' 面積 '])

# Sort by density, highest first, and show the first five rows.
top_5_cities = df.sort_values(density_column, ascending=False).iloc[:5]
print("Top 5 cities with the highest population density:")
print(top_5_cities)

# Sort by density, lowest first, and show the first five rows.
bottom_5_cities = df.sort_values(density_column, ascending=True).iloc[:5]
print("\nTop 5 cities with the lowest population density:")
print(bottom_5_cities)

第一題 -2

import pandas as pd

# Load the CSV file.
df = pd.read_csv('populations1.csv')

# Total population per city: group the rows by city, then sum each group's
# population column.
rows_by_city = df.groupby(' 市 ')
city_populations = rows_by_city[' 人口 '].sum()

# The three cities with the largest totals. end="" suppresses the trailing
# newline here because the next heading already begins with "\n".
top_3_cities = city_populations.nlargest(n=3)
print("Top 3 cities with the highest population:")
print(top_3_cities, end="")

# The three cities with the smallest totals.
bottom_3_cities = city_populations.nsmallest(n=3)
print("\nTop 3 cities with the lowest population:")
print(bottom_3_cities)

第一題 -3

import pandas as pd

# Load the CSV file.
df = pd.read_csv('populations1.csv')

# Compare city names with surrounding whitespace removed. The previous filter
# compared against space-padded literals, so it matched only when the CSV
# cells carried exactly the same padding and otherwise selected no rows.
city_names = df[' 市 '].astype(str).str.strip()

# Display columns: district name and population.
report_columns = [' 區 ', ' 人口 ']

# The three most populous districts of Taipei City.
taipei_top_3_districts = df[city_names == '臺北市'].nlargest(3, ' 人口 ')
print("Top 3 districts with the highest population in Taipei City:")
print(taipei_top_3_districts[report_columns])

# The three most populous districts of Kaohsiung City.
kaohsiung_top_3_districts = df[city_names == '高雄市'].nlargest(3, ' 人口 ')
print("\nTop 3 districts with the highest population in Kaohsiung City:")
print(kaohsiung_top_3_districts[report_columns])

第一題 -4

import pandas as pd

# Load the CSV file.
df = pd.read_csv('populations1.csv')

# Compare city names with surrounding whitespace removed. The previous filter
# compared against space-padded literals, so it matched only when the CSV
# cells carried exactly the same padding and otherwise selected no rows.
city_names = df[' 市 '].astype(str).str.strip()

# Rows whose population lies in [100000, 200000]; between() includes both
# endpoints by default, matching the original >= / <= comparison.
in_population_range = df[' 人口 '].between(100000, 200000)

# Display columns: district name and population.
report_columns = [' 區 ', ' 人口 ']

# Taipei City districts with a population between 100,000 and 200,000.
taipei_districts = df[(city_names == '臺北市') & in_population_range]
print("Districts in Taipei City with population between 100,000 and 200,000:")
print(taipei_districts[report_columns])

# Kaohsiung City districts with a population between 100,000 and 200,000.
kaohsiung_districts = df[(city_names == '高雄市') & in_population_range]
print("Districts in Kaohsiung City with population between 100,000 and 200,000:")
print(kaohsiung_districts[report_columns])

第二大題與期中考相似，可參考下方的期中考解答。
作法：用 def 將期中考的統計計算包成一個函式（模組化），對兩個串列各呼叫一次，再用變數接收回傳值並輸出答案。

大數據期中考

1
2
3

# Sample data for the midterm exercises: two lists of integers, each containing repeated values.
list_1=[18, 58, 14, 31, 72, 23, 12, 59, 94, 71, 41, 92, 30, 66, 99, 37, 48, 100, 3, 67, 42, 28, 0, 74, 94,19, 64, 70, 88, 98, 6, 16, 1, 34, 85, 67, 62, 38, 94, 36, 91, 70, 87, 4, 41, 100, 42, 11, 17, 77, 51, 6, 1, 5, 0, 47,2, 56, 29, 95, 84, 86, 80, 7, 23, 93, 87, 20, 85, 78, 89, 14, 96, 9, 76, 37, 48, 27, 77, 4, 33, 79, 84, 29, 68, 32,81, 87, 10, 8, 90, 48, 95, 64, 15, 38, 60, 92, 36, 73]
list_2=[19, 51, 38, 2, 15, 32, 14, 75, 92, 93, 22, 32, 15, 60, 61, 53, 83, 41, 2, 0, 85, 1, 60, 26, 30, 10,36, 13, 12, 92, 92, 34, 47, 46, 27, 82, 84, 98, 84, 74, 90, 53, 93, 69, 3, 25, 49, 68, 72, 2, 79, 40, 58, 12, 33, 14,74, 26, 52, 34, 39, 21, 94, 41, 90, 79, 71, 46, 78, 22, 9, 51, 59, 81, 52, 87, 68, 20, 67, 80, 54, 35, 44, 98, 30,35, 27, 13, 57, 50, 89, 96, 56, 80, 63, 73, 49, 42, 34, 56]

第二題 -1

# Input data: two lists of integers, each containing repeated values.
list_1=[18, 58, 14, 31, 72, 23, 12, 59, 94, 71, 41, 92, 30, 66, 99, 37, 48, 100, 3, 67, 42, 28, 0, 74, 94,19, 64, 70, 88, 98, 6, 16, 1, 34, 85, 67, 62, 38, 94, 36, 91, 70, 87, 4, 41, 100, 42, 11, 17, 77, 51, 6, 1, 5, 0, 47,2, 56, 29, 95, 84, 86, 80, 7, 23, 93, 87, 20, 85, 78, 89, 14, 96, 9, 76, 37, 48, 27, 77, 4, 33, 79, 84, 29, 68, 32,81, 87, 10, 8, 90, 48, 95, 64, 15, 38, 60, 92, 36, 73]
list_2=[19, 51, 38, 2, 15, 32, 14, 75, 92, 93, 22, 32, 15, 60, 61, 53, 83, 41, 2, 0, 85, 1, 60, 26, 30, 10,36, 13, 12, 92, 92, 34, 47, 46, 27, 82, 84, 98, 84, 74, 90, 53, 93, 69, 3, 25, 49, 68, 72, 2, 79, 40, 58, 12, 33, 14,74, 26, 52, 34, 39, 21, 94, 41, 90, 79, 71, 46, 78, 22, 9, 51, 59, 81, 52, 87, 68, 20, 67, 80, 54, 35, 44, 98, 30,35, 27, 13, 57, 50, 89, 96, 56, 80, 63, 73, 49, 42, 34, 56]

# Remove duplicates by unpacking each list into a set.
set_1 = {*list_1}
set_2 = {*list_2}

# For each set, print its contents followed by its number of distinct values.
for distinct_values in (set_1, set_2):
    print(distinct_values)
    print(len(distinct_values))

第二題 -2

from math import sqrt

def calculate(the_list):
    """Return the mean, median, mode(s) and population standard deviation.

    The values are sorted internally before the median and mode are taken,
    so the input may be given in any order; the input itself is not modified.

    Args:
        the_list: Non-empty sequence of numbers.

    Returns:
        A tuple ``(mean, median, modes, stdev)``. ``modes`` is a list of every
        value that occurs the maximum number of times, in ascending order.
        ``stdev`` is the population standard deviation, i.e. the squared
        deviations are divided by ``len(the_list)``.

    Raises:
        ValueError: If ``the_list`` is empty.
    """
    from collections import Counter

    length = len(the_list)
    if length == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("calculate() needs at least one value")

    # Median and mode are only meaningful on ordered data; work on a sorted
    # copy so the caller's sequence is left untouched.
    ordered = sorted(the_list)

    # Mean
    mean = sum(the_list) / length

    # Median: middle element, or the average of the two middle elements.
    middle = length // 2
    if length % 2 == 0:
        mid = (ordered[middle - 1] + ordered[middle]) / 2
    else:
        mid = ordered[middle]

    # Mode: every value tied for the highest occurrence count. Counter keeps
    # first-seen order, and `ordered` is ascending, so the result is ascending.
    occurrences = Counter(ordered)
    max_num = max(occurrences.values())
    mode = [value for value, seen in occurrences.items() if seen == max_num]

    # Population standard deviation
    variance = sum((value - mean) ** 2 for value in the_list) / length
    stdev = sqrt(variance)
    return mean, mid, mode, stdev

# Input data: two lists of integers, each containing repeated values.
list_1=[18, 58, 14, 31, 72, 23, 12, 59, 94, 71, 41, 92, 30, 66, 99, 37, 48, 100, 3, 67, 42, 28, 0, 74, 94,19, 64, 70, 88, 98, 6, 16, 1, 34, 85, 67, 62, 38, 94, 36, 91, 70, 87, 4, 41, 100, 42, 11, 17, 77, 51, 6, 1, 5, 0, 47,2, 56, 29, 95, 84, 86, 80, 7, 23, 93, 87, 20, 85, 78, 89, 14, 96, 9, 76, 37, 48, 27, 77, 4, 33, 79, 84, 29, 68, 32,81, 87, 10, 8, 90, 48, 95, 64, 15, 38, 60, 92, 36, 73]
list_2=[19, 51, 38, 2, 15, 32, 14, 75, 92, 93, 22, 32, 15, 60, 61, 53, 83, 41, 2, 0, 85, 1, 60, 26, 30, 10,36, 13, 12, 92, 92, 34, 47, 46, 27, 82, 84, 98, 84, 74, 90, 53, 93, 69, 3, 25, 49, 68, 72, 2, 79, 40, 58, 12, 33, 14,74, 26, 52, 34, 39, 21, 94, 41, 90, 79, 71, 46, 78, 22, 9, 51, 59, 81, 52, 87, 68, 20, 67, 80, 54, 35, 44, 98, 30,35, 27, 13, 57, 50, 89, 96, 56, 80, 63, 73, 49, 42, 34, 56]

# Remove duplicates, then sort explicitly. Set iteration order is not
# guaranteed, so sorting makes sure calculate() always receives the values in
# ascending order rather than relying on how the interpreter lays out the set.
list_1 = sorted(set(list_1))
list_2 = sorted(set(list_2))

# calculate() returns (mean, median, modes, standard deviation).
# NOTE(review): with duplicates removed every value occurs once, so every
# value ties for the mode - confirm this is what the exercise intends.
mean_1, median_1, mode_1, stdev_1 = calculate(list_1)
mean_2, median_2, mode_2, stdev_2 = calculate(list_2)

# Results for list_1; the modes are printed back to back with no separator.
print(mean_1)
print(median_1)
print(*mode_1,sep='')
print(stdev_1)

# Results for list_2, in the same order.
print(mean_2)
print(median_2)
print(*mode_2,sep='')
print(stdev_2)

第二題 -3

# Input data: two lists of integers, each containing repeated values.
list_1=[18, 58, 14, 31, 72, 23, 12, 59, 94, 71, 41, 92, 30, 66, 99, 37, 48, 100, 3, 67, 42, 28, 0, 74, 94,19, 64, 70, 88, 98, 6, 16, 1, 34, 85, 67, 62, 38, 94, 36, 91, 70, 87, 4, 41, 100, 42, 11, 17, 77, 51, 6, 1, 5, 0, 47,2, 56, 29, 95, 84, 86, 80, 7, 23, 93, 87, 20, 85, 78, 89, 14, 96, 9, 76, 37, 48, 27, 77, 4, 33, 79, 84, 29, 68, 32,81, 87, 10, 8, 90, 48, 95, 64, 15, 38, 60, 92, 36, 73]
list_2=[19, 51, 38, 2, 15, 32, 14, 75, 92, 93, 22, 32, 15, 60, 61, 53, 83, 41, 2, 0, 85, 1, 60, 26, 30, 10,36, 13, 12, 92, 92, 34, 47, 46, 27, 82, 84, 98, 84, 74, 90, 53, 93, 69, 3, 25, 49, 68, 72, 2, 79, 40, 58, 12, 33, 14,74, 26, 52, 34, 39, 21, 94, 41, 90, 79, 71, 46, 78, 22, 9, 51, 59, 81, 52, 87, 68, 20, 67, 80, 54, 35, 44, 98, 30,35, 27, 13, 57, 50, 89, 96, 56, 80, 63, 73, 49, 42, 34, 56]

# Remove duplicates by unpacking each list into a set.
set_1 = {*list_1}
set_2 = {*list_2}

# Intersection: values present in both sets.
intersection = set_1.intersection(set_2)

# Union: values present in at least one of the sets.
union = set_1.union(set_2)

# Symmetric difference: values present in exactly one of the sets.
sym_diff = set_1.symmetric_difference(set_2)

# Print each result followed by its size, in the order computed above.
for result in (intersection, union, sym_diff):
    print(result)
    print(len(result))

作者: 微風