Commit 27a4cdb8 authored by Craig Earley's avatar Craig Earley
Browse files

SQL script that creates tables and populates some of them from the raw CSV files of the 911 data

parent a8abd1e9
begin;

-- -- Staging tables: one column per field of the raw CSV exports.
create table calls_csv (
    call_id   char(15),
    date      date,
    time      time without time zone,
    complaint varchar,
    address   varchar,
    city      varchar,
    lat       real,
    long      real
);

create table response_csv (
    call_id char(15),
    abbrev  varchar,
    name    varchar
);

-- -- Load the raw, uncleaned CSV files into the staging tables.
-- The calls file is read with the header option; the dispatch file is not.
-- (\copy is a psql meta-command, so each one has to stay on a single line.)
\copy calls_csv from '/cluster/home/datascience/911calls/raw/2015calls.csv' with delimiter ',' csv header;
\copy response_csv from '/cluster/home/datascience/911calls/raw/departmentsdispatched.csv' with delimiter ',' csv;
-- Normalise capitalisation first so the patterns below only have to deal with
-- initcap spellings. Unfiltered on purpose: the whole column is rewritten.
update calls_csv
set city = initcap(city);

-- -- do something to get the additional geocodes we need: postgis? clean up, geocode
-- -- Update all the things
-- update agencies set agency_type = 'law' where agency_name like '%POLICE%';
-- update agencies set agency_type = 'fire' where agency_name like '%FIRE%';
-- update agencies set agency_type = 'law' where agency_name like '%SHERIFF%';

-- Make cities consistent and correct.
-- The statements are order-dependent (later ones see the results of earlier
-- ones), so they are kept in their original sequence.
update calls_csv
set city = 'Richmond'
where city like 'Rich%'
   or city like '%chmond';

update calls_csv
set city = 'Greens Fork'
where city like 'Green%ork';

update calls_csv
set city = 'Hagerstown'
where city like 'Hage%';

update calls_csv
set city = 'Boston'
where city like 'Bost%';

update calls_csv
set city = 'Connersville'
where city like 'Connor%';

update calls_csv
set city = 'Cambridge City'
where city like 'Cambr%';

update calls_csv
set city = 'New Paris'
where city like 'New Paris%';

update calls_csv
set city = 'Union City'
where city like 'Union%';

update calls_csv
set city = 'Randolph County'
where city like 'Rando%';

update calls_csv
set city = 'Fayette County'
where city like 'Fayette%';

-- A street address that ended up in the city column.
update calls_csv
set city = 'Richmond'
where city like '1 S 9th%';

update calls_csv
set city = 'Economy'
where city like 'Econ%';

-- No wildcard here: only the exact value 'Cnet' is rewritten.
update calls_csv
set city = 'Centerville'
where city like 'Cnet';

-- Junk values that are mapped to Richmond.
update calls_csv
set city = 'Richmond'
where city in ('Q', 'Wtt861');

update calls_csv
set city = 'Farmland Area'
where city like 'Farmland%';

-- Rows identified by their address rather than by the city value.
update calls_csv
set city = 'Richmond'
where address like 'RPD';

update calls_csv
set city = 'Richmond'
where address like '18 N 22ND ST'
  and city like ' %';

-- Some things that are generic enough to update to Wayne County alone
update calls_csv
set city = 'Wayne County'
where city in ('Your City', 'Unk', 'Pu');

-- Very specific problems
update calls_csv
set city = 'Richmond'
where address like '%SW 18TH%';

-- The address was entered as the city: move it into the address column and
-- set the real city in the same statement.
update calls_csv
set address = 'Bertschland, 1 E Church St',
    city = 'Cambridge City'
where city = '1 E Church St';
-- -- Modify the response_csv table
-- Capitalisation pass over every agency name (unfiltered on purpose), then
-- fold the 'Negg...' spellings onto a single name.
update response_csv
set name = initcap(name);

update response_csv
set name = 'Negotiators'
where name like 'Negg%';
-- -- if no errors, insert the clean data into the appropriate postgres tables
-- -- spread the data downstream to the derived tables
-- -- Take care of calls_csv
-- want to use property cards for the ID over time

-- place: rows loaded from geocodes.csv plus one row per distinct
-- (address, lat, long, city) combination found in the cleaned calls.
create table place (
    place_id serial unique,
    addr     varchar,
    lat      real,
    long     real,
    city     varchar,
    state    varchar
);
\copy place (lat, long, addr, city, state) from '/cluster/home/cjearley13/geocodes.csv' with delimiter ',' csv header;
insert into place (addr, lat, long, city)
select distinct address, lat, long, city
from calls_csv;

-- -- locations are processed, now use that
-- place_id is an integer so that it has the same type as the serial key
-- place.place_id it is copied from (it was previously declared real).
create table calls (
    call_id   char(15),
    place_id  integer,
    date      date,
    time      time,
    type_name char(15)
);

-- fix this one:
-- NOTE(review): the join matches on address only, while place holds one row
-- per distinct (address, lat, long, city) plus the geocodes.csv rows. An
-- address that occurs in place more than once therefore yields one calls row
-- per matching place, and rows with a NULL address are dropped. Whether the
-- join should also match on city depends on how geocodes.csv spells its
-- cities -- TODO confirm before tightening it.
insert into calls (call_id, place_id, date, time)
select
    calls_csv.call_id,
    place.place_id,
    calls_csv.date,
    calls_csv.time
from calls_csv
inner join place
    on place.addr = calls_csv.address;

-- debug
-- Shows each call's stored place_id next to the address and id of the place
-- row it joins to.
create view address_align as
select
    calls.call_id,
    calls.place_id,
    place.addr,
    place.place_id as place
from calls
inner join place
    on calls.place_id = place.place_id;
-- -- incident hierarchy
create table call_type (
    complaint_name varchar,
    type_name      char(15),
    category_id    char(15)
);

-- Seed one row per distinct upper-cased complaint; type_name starts out as a
-- copy of the complaint text.
-- NOTE(review): type_name is char(15), and Postgres rejects longer non-blank
-- values, so this presumably relies on every complaint being at most 15
-- characters -- confirm against the data.
insert into call_type (complaint_name, type_name)
select distinct
    upper(complaint),
    upper(complaint)
from calls_csv;

-- -- clean up the complaints
-- Complaints that start with one of the listed ten-codes get just the
-- five-character code as their type_name.
update call_type
set type_name = left(complaint_name, 5)
where left(complaint_name, 5) in (
    '10-15', '10-16', '10-31', '10-37', '10-50',
    '10-52', '10-53', '10-58', '10-59', '10-70',
    '10-97', '10-11', '10-90'
);

create table call_category (
    category_id   serial unique,
    category_name varchar
);
-- insert into call_category (category_name) select cat names;
-- update call_type set call_type.category_id = call_category.category_id;

create table call_group (
    group_id   real,
    group_name varchar
);
-- insert into call_category (category_name) select cat names;
-- update call_type set call_type.category_id = call_category.category_id;
-- -- Now take care of response_csv
-- -- agency processing: create the tables, populate the agency table, and from those id's populate the response table

-- agency: one row per distinct (abbrev, name) pair in the cleaned staging data.
create table agency (
    agency_id     serial unique,
    agency_abbrev varchar,
    agency_name   varchar,
    city          varchar
);
insert into agency (agency_abbrev, agency_name)
select distinct
    abbrev,
    name
from response_csv;

create table agency_type (agency_id serial unique, type_name char(15));

-- agency_id is an integer so that it has the same type as the serial key
-- agency.agency_id it is copied from (it was previously declared real).
create table response (
    call_id   char(15),
    agency_id integer
);

-- Link every dispatched agency to its call.
-- agency is distinct on (abbrev, name), so the join has to match on both
-- columns: matching on abbrev alone would emit one response row per spelling
-- of the name whenever an abbreviation appears with more than one name.
-- "is not distinct from" keeps rows whose name is NULL matched to the agency
-- row that also has a NULL name.
insert into response (call_id, agency_id)
select
    calls_csv.call_id,
    agency.agency_id
from calls_csv
inner join response_csv
    on response_csv.call_id = calls_csv.call_id
inner join agency
    on response_csv.abbrev = agency.agency_abbrev
   and response_csv.name is not distinct from agency.agency_name;

commit;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment