Hide
Free your code from a slow death on your hard drive Join Siafoo Now or Learn More

Utility for loading GeoIP data into a PostgreSQL database Atom Feed 0

In Brief This is a script for loading the MaxMind GeoIP data into a PostgreSQL database. The script will create the database structure, load the data, add some utility functions and create the necessary indexes.... more
# 's
  1#!/usr/bin/python
2
3'''
4Script for loading GeoIP CSV data into a postgresql database
5'''
6
7import logging, psycopg2, psycopg2.extensions, sys
8
9from optparse import OptionGroup, OptionParser
10from StringIO import StringIO
11
12class GeoIPDataLoader(object):
13
14 def __init__(self, dsn, blocks='GeoLiteCity-Blocks.csv', locations='GeoLiteCity-Location.csv', schema='public', zip=None):
15 self.con = psycopg2.connect(dsn)
16 # We don't need transactions... right?
17 self.con.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
18 # The data is in ISO8859_15 encoding
19 self.con.set_client_encoding('iso8859_15')
20 self.cur = self.con.cursor()
21
22 self.blocks_csv = blocks
23 self.location_csv = locations
24 self.zip = zip
25 self.schema = schema
26
27 def close(self):
28 self.con.close()
29
30 def create_tables(self):
31 print 'Creating structure...',
32 self.db_execute(
33 '''
34 CREATE TABLE locations
35 (
36 id bigint NOT NULL,
37 country character(2) NOT NULL,
38 region character(2),
39 city character varying(75),
40 postal_code character varying(15),
41 latitude numeric(6,4) NOT NULL,
42 longitude numeric(7,4),
43 metro_code integer,
44 area_code integer,
45 CONSTRAINT locations_pkey PRIMARY KEY (id)
46 );
47
48 CREATE TABLE blocks
49 (
50 start_ip bigint NOT NULL,
51 end_ip bigint NOT NULL,
52 location_id bigint NOT NULL
53 );
54
55 '''
56 )
57 print '\033[1;32mDone\033[1;m'
58
59 def create_indexes(self, ip4=False):
60 print 'Adding Indexes...',
61 sys.stdout.flush()
62 if not ip4:
63 self.db_execute('''
64 CREATE INDEX ix_start_end_ip ON blocks
65 USING btree (start_ip, end_ip) WITH (FILLFACTOR=100);
66 CREATE INDEX ix_end_start_ip ON blocks
67 USING btree (end_ip, start_ip) WITH (FILLFACTOR=100);
68 ''')
69 else:
70 self.db_execute('''
71 CREATE INDEX ix_ip_range ON blocks
72 USING gist (ip_range) WITH (FILLFACTOR=100);
73 ''')
74 print '\033[1;32mDone\033[1;m'
75
76 def create_functions(self, ip4=False):
77 print 'Adding utility functions...',
78 sys.stdout.flush()
79 if ip4:
80 self.db_execute('''
81 CREATE OR REPLACE FUNCTION get_location(inet) RETURNS bigint AS $$
82 SELECT location_id FROM %s.blocks
83 WHERE ip_range >>= ip4($1)
84 LIMIT 1
85 $$ LANGUAGE SQL;
86 ''' % self.schema)
87 else:
88 self.db_execute('''
89 CREATE OR REPLACE FUNCTION inet_to_bigint(inet) RETURNS bigint AS $$
90 SELECT $1 - inet '0.0.0.0'
91 $$ LANGUAGE SQL;
92 ''')
93 print '\033[1;32mDone\033[1;m'
94
95 def create_schema(self):
96 try:
97 self.db_execute('''CREATE SCHEMA %s;''' % self.schema)
98 except psycopg2.ProgrammingError:
99 pass
100
101 self.db_execute('SET search_path TO %s,public;' % self.schema)
102
103 def db_execute(self, ddl):
104 self.cur.execute(ddl)
105# self.con.commit()
106
107 def load_data(self):
108
109 if self.zip:
110 # Something more clever can be done here... but maybe... later
111 from zipfile import ZipFile
112
113 zip = ZipFile(self.zip)
114
115 for z in zip.infolist():
116 if z.filename.endswith(self.location_csv):
117 self.load_table(z.filename, 'locations', data_file=StringIO(zip.read(z.filename)))
118 elif z.filename.endswith(self.blocks_csv):
119 self.load_table(z.filename, 'blocks', data_file=StringIO(zip.read(z.filename)))
120 else:
121 # Load Locations
122 self.load_table(self.location_csv, 'locations')
123 # Load Blocks
124 self.load_table(self.blocks_csv, 'blocks')
125
126 def load_table(self, file_name, table_name, data_file=None):
127 print 'Loading table \033[1;34m%s\033[1;m from file \033[1;34m%s\033[1;m...' % (table_name, file_name),
128 sys.stdout.flush()
129
130 if not data_file:
131 data_file = open(file_name)
132
133 # Skip the copyright header
134 data_file.readline()
135 data_file.readline()
136 #Remove quotes... psycopg2's `copy` errors on them
137 string_data = data_file.read().replace('"', '')
138
139 self.cur.copy_from(StringIO(string_data), table_name, sep=',', null='')
140 print '\033[1;32mDone\033[1;m'
141
142 def migrate_to_ip4(self):
143 print 'Adding ip_range column'
144 self.db_execute('''
145 ALTER TABLE blocks ADD COLUMN ip_range ip4r;
146 ALTER TABLE blocks ALTER COLUMN ip_range SET STORAGE PLAIN;
147 ''')
148
149 print 'Migrating data to ip4...',
150 sys.stdout.flush()
151 self.db_execute('''UPDATE blocks SET ip_range = ip4r(start_ip::ip4, end_ip::ip4)''')
152 print '\033[1;32mDone\033[1;m'
153
154 print 'Dropping unneeded columns'
155 self.db_execute('''
156 ALTER TABLE blocks DROP COLUMN start_ip;
157 ALTER TABLE blocks DROP COLUMN end_ip;
158 ''')
159 def vacuum(self):
160 print 'Vaccuming database...',
161 sys.stdout.flush()
162 self.db_execute('VACUUM FULL ANALYZE')
163 print '\033[1;32mDone\033[1;m'
164
165def main():
166 DSN = "dbname='%s' user='%s' host='%s'"
167
168 parser = OptionParser()
169 # Operational options
170 parser.add_option('-c', '--load-ddl', dest='load_ddl', default=False,
171 action='store_true', help='Create database structure')
172
173 parser.add_option('-g', '--load-data', dest='load', default=False,
174 action='store_true', help='Load the GeoIP data')
175
176 parser.add_option('-b', '--blocks-file', dest='blocks_csv', default='GeoLiteCity-Blocks.csv',
177 action='store', help='GeoIP Blocks CSV file [default: %default]', metavar='BLOCKS_FILE')
178 parser.add_option('-l', '--locations-file', dest='locations_csv', default='GeoLiteCity-Location.csv',
179 action='store', help='GoeIP Locations CSV file [default: %default]', metavar='LOCATIONS_FILE')
180
181 parser.add_option('-z', '--zip', dest='zip',
182 action='store', help='GoeIP Locations ZIP file [default: %default]', metavar='ZIP_FILE')
183
184 db_group = OptionGroup(parser, 'Database Options')
185 # Database options
186 db_group.add_option('-H', '--host', dest='db_host', default='localhost',
187 action='store', help='Database host [default: %default]', metavar='DB_HOST')
188 db_group.add_option('-d', '--database', dest='db_name', default='geoip_db',
189 action='store', help='Database name [default: %default]', metavar='DATABASE_NAME')
190 db_group.add_option('-U', '--user', dest='db_user', default='geoip',
191 action='store', help='User [default: %default]', metavar='USER_NAME')
192 db_group.add_option('-s', '--schema', dest='schema', default='public',
193 action='store', help='Database Schema [default: %default]', metavar='SCHEMA')
194
195 db_group.add_option('--ip4r', dest='ip4', default=False,
196 action='store_true', help='Use IP4r module [default: %default]')
197
198 parser.add_option_group(db_group)
199
200 (options, args) = parser.parse_args()
201
202 data_loader = GeoIPDataLoader("dbname='%s' user='%s' host='%s'" % (options.db_name, options.db_user, options.db_host),
203 blocks=options.blocks_csv, locations=options.locations_csv, zip=options.zip, schema=options.schema)
204
205 if options.ip4 and not options.load and options.load_ddl:
206 print '\033[1;31mERROR\033[1;m Creating a raw IP4 schema breaks data loading. Use --ip4r switch ONLY during or after loading data'
207 return
208
209 if not options.load_ddl and not options.load:
210 parser.print_help()
211 return
212
213 if options.load_ddl:
214 if options.schema != 'public':
215 data_loader.create_schema()
216 data_loader.create_tables()
217
218 if options.load:
219 data_loader.load_data()
220
221 if options.ip4:
222 data_loader.migrate_to_ip4()
223
224 if options.load:
225 data_loader.create_indexes(options.ip4 is True)
226
227 if options.load_ddl:
228 data_loader.create_functions(options.ip4 is True)
229
230 data_loader.vacuum()
231
232if __name__ == "__main__":
233 main()

This is a script for loading the MaxMind GeoIP data into a PostgreSQL database. The script will create the database structure, load the data, add some utility functions and create the necessary indexes.

The following command will create the structure and load the data from the given files into a database named geoip_db (the default)

python load_geoip.py -c -g -z GeoLiteCity_20090501.zip

If the above command fails, try unzipping the archive and providing the names of the individual files:

python load_geoip.py -c -g -b ./GeoLiteCity_20090501/GeoLiteCity-Blocks.csv -l ./GeoLiteCity_20090501/GeoLiteCity-Location.csv

Run with --help to get usage options and database connection parameters

Note

If you get "psycopg2.DataError: extra data after last expected column" drop the tables and start again. (The error means that there aren't enough columns in the table to fit the data... in our case the loading script dropped the columns because of ip4r )