～続・不動産とファイナンス・賃貸物件入居者編(2)～「機械学習を使って東京23区のお買い得賃貸物件を探してみた」を千葉県で再度やってみる（データクレンジング編）

今回は前回記事の続きで、SUUMOから取得してきたデータのクレンジングを行います。

d-s.hatenablog.com

なお、このテーマはshokosakaさんの記事が参考となっています。

www.analyze-world.com

参考記事ではPythonでデータクレンジング(前処理)を行っていますが、私はRで実施しています。

Pythonでも似たようなコードになると思うので、Pythonユーザーの方でも問題ないと思いますが、ご希望の方はコメントくださればPythonのコードも書いて追記します。

では、さっそく内容に入っていきます。

前回の記事でスクレイピングしてきたデータをクレンジングした結果を先に見ておきます。

以下のようになりました。

> t(head(suumo_data,3))

             1                                 2                             3           

name         "ホーリーグランド稲毛山王305号室" "ＪＲ総武線稲毛駅4階建築52年" "大宮町貸家"

rent1        "30000"                           "50000"                       "40000"     

rent2        "0"                               "3000"                        "0"         

shikikin     "30000"                           "0"                           "40000"     

reikin       "30000"                           "0"                           "0"         

hoshoukin    "0"                               "0"                           "0"         

layout       "1K"                              "1DK"                         "1DK"       

area         "21.06"                           "34.27"                       "33.12"     

direction    "東"                              "東"                          "北西"      

type         "マンション"                      "マンション"                  "一戸建て"  

age          "89"                              "52"                          "48"        

rout1        "ＪＲ総武線"                      "ＪＲ総武線"                 "ＪＲ総武線"

station      "稲毛"                            "稲毛"                        "千葉"      

distance     " 72"                             " 25"                         "100"       

construction "鉄骨"                            "鉄筋コン"                    "木造"      

floor        "3"                               "2"                           "1"         

height       "4"                               "4"                           "1"         

car.dum      "敷地内"                          "近隣"                        "付無料"    

free_rent    "0"                               "1"                           "0"

変数名は日本語をローマ字で充てたものと、簡単な英単語なので何かわかると思います。

補足をしておきますと以下の通りです。

area: 専有面積、age: 築年数、rout1: 最寄路線、station: 最寄駅、distance: 最寄駅までの徒歩時間(分)、

floor: その部屋が何階か、height: その物件は何階建か、car.dum: 駐車場有無、

free_rent: フリーレントの実施有無(期間は問わず)

以下が今回作成したRのコードです。

コードを解説しながら進めていこうと思ったのですが、コメントアウトを見ていただければそれなりにわかると思いますので、説明を省略してコードを載せてしまいます。

注意点としては、変数「distance(最寄り駅までの分数)」の取り扱いがあります。抽出した時点のデータセットでは徒歩とバスでの所要分数が入り混じっていました。

今回は、「最寄り駅まで徒歩で何分か」という基準で統一することにしたので、バス～分という場合は、バス～分に×4をして徒歩～分に置き換えています。

具体的には、バス10分であれば徒歩40分に置き換える。そんな感じです。

置き換えの根拠も載せてありますが、あくまで私なりの置き換え方法なので、何かアドバイスがある方はぜひ教えてください。

次回は、予測と物件リコメンドに入ります。

library(tidyverse)



# Directory that holds the scraped CSV files (replace the placeholder with
# your own path).
strDirPath <- 'スクレイピングしてきたデータが格納されているファイルPATHを各自記入'

# Read every file in that directory (cp932-encoded CSV) and stack the results
# into a single data frame.
lf <- list.files(path = strDirPath, full.names = TRUE)

# Fail loudly when nothing was found; otherwise do.call(rbind, list()) would
# silently yield NULL and every later step would fail with a confusing error.
if (length(lf) == 0) {
  stop('No input files found in: ', strDirPath, call. = FALSE)
}

data <- lapply(lf, read.csv, fileEncoding = 'cp932')
suumo_data <- do.call(rbind, data)



# Sanity check: count rows that repeat an earlier row on the identifying
# columns. (For this data set duplicates had already been removed in
# LibreOffice Calc after extraction, so none were expected.)
# The column names must be looked up as columns: passing quoted strings to
# distinct() compares constants, which can never reveal duplicates.
# NOTE(review): both spellings of the floor/height column are listed because
# the file uses `floor_height` and `floor_heigth`; only names that actually
# exist in the data are used.
dup_keys <- intersect(
  c('name', 'rent1', 'rent2', 'layout', 'area',
    'floor_height', 'floor_heigth'),
  names(suumo_data)
)
if (length(dup_keys) > 0) {
  n_dup <- sum(duplicated(suumo_data[dup_keys]))
  message(
    'Duplicated rows on (', paste(dup_keys, collapse = ', '), '): ', n_dup
  )
}



# Split the delimited columns into their parts:
#   shiki_rei    -> shikikin / reikin      (split on '/')
#   access1      -> rout1 / distance1      (split on '/')
#   floor_heigth -> floor / height         (split on '/')
#   distance1    -> station / distance     (split on '駅')
# NOTE(review): `floor_heigth` is kept exactly as spelled here; confirm it
# matches the column name produced by the scraper.
suumo_data <- suumo_data %>%
  separate(col = shiki_rei, into = c('shikikin', 'reikin'), sep = '/') %>%
  separate(col = access1, into = c('rout1', 'distance1'), sep = '/') %>%
  separate(col = floor_heigth, into = c('floor', 'height'), sep = '/') %>%
  separate(col = distance1, into = c('station', 'distance'), sep = '駅')

# Listings whose nearest "station" is registered as a bus company end up with
# distance == NA. They are very few and rest on quite different assumptions,
# so they are dropped.
suumo_data <- subset(suumo_data, !is.na(distance))



# rent2: strip the fixed-term-lease marker and turn the '-' placeholder
# into 0.
suumo_data$rent2 <- suumo_data$rent2 %>%
  str_replace('※定期借家', '') %>%
  str_replace('-', '0')

# Columns that hold amounts in units of 10,000 yen.
# NOTE(review): the previous code addressed the deposit-type columns by
# position (4:7); they are assumed to be the four columns named below --
# confirm against the scraped data layout.
deposit_cols <- c('shikikin', 'reikin', 'hoshoukin', 'shikibiki_shoukyaku')
man_yen_cols <- c('rent1', deposit_cols)

# '-' means "none" for these columns, so treat it as 0 before converting.
suumo_data[deposit_cols] <- lapply(
  suumo_data[deposit_cols],
  str_replace,
  pattern = '-',
  replacement = '0'
)

# Convert to yen. Parse as numeric first: as.integer() on a string such as
# "3.5" truncates to 3, which would turn 35,000 yen into 30,000 yen.
# round() guards against floating-point error before the integer cast.
# The conversion is done exactly once; re-running the string cleanup on the
# converted columns would coerce them back to text (e.g. "1e+05").
suumo_data[man_yen_cols] <- lapply(
  suumo_data[man_yen_cols],
  function(x) as.integer(round(as.numeric(x) * 10000))
)



# Newly built properties show '新' as their age; record them as 0.
suumo_data$age <- replace(suumo_data$age, suumo_data$age == '新', 0)

# Single-storey properties end up with height == NA; treat them as
# one-storey buildings.
suumo_data$height <- replace(
  suumo_data$height,
  is.na(suumo_data$height),
  '1階建'
)



# floor: map '平屋' (single-storey) to floor 1, then strip the '階' suffix.
# Stray '建' and 'B' characters in the data look like errors and are removed.
# A floor value of '0' is also treated as an error and blanked out; the
# pattern is anchored so that only a value that is exactly '0' is affected.
# An unanchored '0' would also eat the zero in legitimate floors such as
# '10' or '20'.
suumo_data$floor <- suumo_data$floor %>%
  str_replace('平屋', '1階') %>%
  str_replace('階', '') %>%
  str_replace('建', '') %>%
  str_replace('B', '') %>%
  str_replace('^0$', '')



# height: keep only the number of above-ground storeys.
# The basement prefix ('地下1地上' ... '地下9地上') is removed first, one
# pattern at a time in ascending order, and then the '階建' suffix.
suumo_data$height <- Reduce(
  function(h, prefix) str_replace(h, prefix, ''),
  paste0('地下', 1:9, '地上'),
  suumo_data$height
) %>%
  str_replace('階建', '')



# kosuu, car, jouken: remove the newline / tab runs that the scraping step
# picked up by mistake.
# The patterns are written with explicit escapes instead of literal
# multi-line strings, which are invisible in the source and easily mangled
# when the code is copied. Every run of CR / LF / TAB is removed, not just
# the first occurrence.
scraped_text_cols <- c('kosuu', 'car', 'jouken')
suumo_data[scraped_text_cols] <- lapply(
  suumo_data[scraped_text_cols],
  str_replace_all,
  pattern = '[\\r\\n\\t]+',
  replacement = ''
)





# car.dum: classify parking into '敷地内' (on site), '近隣' (nearby),
# '付無料' (included free of charge) or '無し' (none). Price information in
# `car` is ignored. The first matching label wins, in the order listed.
# Vectorised with case_when(): a missing `car` value falls through to '無し'
# instead of aborting an `if`, and an empty data frame still gets a column.
suumo_data$car.dum <- case_when(
  str_detect(suumo_data$car, '敷地内') ~ '敷地内',
  str_detect(suumo_data$car, '近隣') ~ '近隣',
  str_detect(suumo_data$car, '付無料') ~ '付無料',
  TRUE ~ '無し'
)



# free_rent: 1 when the listing conditions mention 'フリーレント' (free
# rent), 0 otherwise. Vectorised; `%in% TRUE` maps a missing `jouken` to 0
# rather than failing.
suumo_data$free_rent <- as.numeric(
  str_detect(suumo_data$jouken, 'フリーレント') %in% TRUE
)



# distance: express the trip to the nearest station in walking minutes.
# When the listing gives bus or car minutes, multiply by 4 to approximate
# the walking time.
# Rationale: walking is commonly taken as 80 m/min. A bus, stops included,
# is taken as 20 km/h = about 333 m/min (the same figure is used for cars).
# 333 / 80 is roughly 4. The walk from home to the bus stop is ignored.

# Keep the raw text so the transport mode can still be detected after the
# minutes have been extracted.
suumo_data$distance_copy <- suumo_data$distance

# Keep only the part before the first '分' (minutes); anything after it is
# discarded. extra/fill only silence the warnings for rows with more or
# fewer pieces; the values kept are the same.
suumo_data <- suumo_data %>%
  separate(distance, c('distance', 'unused'), '分',
           remove = TRUE, extra = 'drop', fill = 'right')

# Strip the transport-mode prefix and parse the minutes.
distance_minutes <- suumo_data$distance %>%
  str_replace('バス', '') %>%
  str_replace('歩', '') %>%
  str_replace('車', '') %>%
  as.integer()

# x4 for bus / car trips, x1 for walking.
by_vehicle <- str_detect(suumo_data$distance_copy, 'バス|車')
suumo_data$distance <- distance_minutes * ifelse(by_vehicle, 4, 1)


# Keep only the variables that will be used for modelling.
tar <- c(
  'name', 'rent1', 'rent2', 'shikikin', 'reikin', 'hoshoukin',
  'layout', 'area', 'direction', 'type', 'age',
  'rout1', 'station', 'distance',
  'construction', 'floor', 'height', 'car.dum', 'free_rent'
)
suumo_data <- suumo_data[, tar, drop = FALSE]

# Write the cleansed data as a cp932-encoded CSV without row names
# (replace the placeholder with a file name of your choice).
write.csv(
  suumo_data,
  file = 'お好みのファイル名',
  row.names = FALSE,
  fileEncoding = 'cp932'
)