location-importer.in: skip networks with unknown country codes

Message ID 20201030143510.6514-1-peter.mueller@ipfire.org
State Superseded
Headers show
Series
  • location-importer.in: skip networks with unknown country codes
Related show

Commit Message

Peter Müller Oct. 30, 2020, 2:35 p.m. UTC
There is no sense in parsing and storting networks whose country codes
cannot be found in the ISO-3166-x country code table. This avoids side
effects in applications using the location database, and introduces
another sanity check to compensate bogus RIR data.

On location02, this affects some networks from APNIC (country code: ZZ)
as well as a bunch of smaller allocations within the RIPE region still
tagged to CS or YU (Yugoslavia). To my surprise, no network tagged as SU
(Soviet Union) was found - while the NIC for .su TLD is still
operational. :-)

Fixes: #12510

Signed-off-by: Peter Müller <peter.mueller@ipfire.org>
---
 src/python/location-importer.in | 42 ++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 14 deletions(-)

Comments

Peter Müller Jan. 26, 2021, 3:34 p.m. UTC | #1
Hello Michael,

if I got this right, this patch still waits acceptance/rejection, which is why I just
wanted to bring it up again. :-)

Thanks, and best regards,
Peter Müller

> There is no sense in parsing and storting networks whose country codes
> cannot be found in the ISO-3166-x country code table. This avoids side
> effects in applications using the location database, and introduces
> another sanity check to compensate bogus RIR data.
> 
> On location02, this affects some networks from APNIC (country code: ZZ)
> as well as a bunch of smaller allocations within the RIPE region still
> tagged to CS or YU (Yugoslavia). To my surprise, no network tagged as SU
> (Soviet Union) was found - while the NIC for .su TLD is still
> operational. :-)
> 
> Fixes: #12510
> 
> Signed-off-by: Peter Müller <peter.mueller@ipfire.org>
> ---
>  src/python/location-importer.in | 42 ++++++++++++++++++++++-----------
>  1 file changed, 28 insertions(+), 14 deletions(-)
> 
> diff --git a/src/python/location-importer.in b/src/python/location-importer.in
> index 864eab1..89b556a 100644
> --- a/src/python/location-importer.in
> +++ b/src/python/location-importer.in
> @@ -388,10 +388,17 @@ class CLI(object):
>  				TRUNCATE TABLE networks;
>  			""")
>  
> +			# Fetch all valid country codes to check parsed networks aganist...
> +			rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
> +			validcountries = []
> +
> +			for row in rows:
> +				validcountries.append(row.country_code)
> +
>  			for source in location.importer.WHOIS_SOURCES:
>  				with downloader.request(source, return_blocks=True) as f:
>  					for block in f:
> -						self._parse_block(block)
> +						self._parse_block(block, validcountries)
>  
>  			# Process all parsed networks from every RIR we happen to have access to,
>  			# insert the largest network chunks into the networks table immediately...
> @@ -467,7 +474,7 @@ class CLI(object):
>  				# Download data
>  				with downloader.request(source) as f:
>  					for line in f:
> -						self._parse_line(line)
> +						self._parse_line(line, validcountries)
>  
>  	def _check_parsed_network(self, network):
>  		"""
> @@ -532,7 +539,7 @@ class CLI(object):
>  		# be suitable for libloc consumption...
>  		return True
>  
> -	def _parse_block(self, block):
> +	def _parse_block(self, block, validcountries = None):
>  		# Get first line to find out what type of block this is
>  		line = block[0]
>  
> @@ -542,7 +549,7 @@ class CLI(object):
>  
>  		# inetnum
>  		if line.startswith("inet6num:") or line.startswith("inetnum:"):
> -			return self._parse_inetnum_block(block)
> +			return self._parse_inetnum_block(block, validcountries)
>  
>  		# organisation
>  		elif line.startswith("organisation:"):
> @@ -573,7 +580,7 @@ class CLI(object):
>  			autnum.get("asn"), autnum.get("org"),
>  		)
>  
> -	def _parse_inetnum_block(self, block):
> +	def _parse_inetnum_block(self, block, validcountries = None):
>  		log.debug("Parsing inetnum block:")
>  
>  		inetnum = {}
> @@ -624,17 +631,17 @@ class CLI(object):
>  		if not inetnum or not "country" in inetnum:
>  			return
>  
> -		# Skip objects with bogus country code 'ZZ'
> -		if inetnum.get("country") == "ZZ":
> -			log.warning("Skipping network with bogus country 'ZZ': %s" % \
> -				(inetnum.get("inet6num") or inetnum.get("inetnum")))
> -			return
> -
>  		network = ipaddress.ip_network(inetnum.get("inet6num") or inetnum.get("inetnum"), strict=False)
>  
>  		if not self._check_parsed_network(network):
>  			return
>  
> +		# Skip objects with unknown country codes
> +		if validcountries and inetnum.get("country") not in validcountries:
> +			log.warning("Skipping network with bogus country '%s': %s" % \
> +				(inetnum.get("country"), inetnum.get("inet6num") or inetnum.get("inetnum")))
> +			return
> +
>  		self.db.execute("INSERT INTO _rirdata(network, country) \
>  			VALUES(%s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
>  			"%s" % network, inetnum.get("country"),
> @@ -659,7 +666,7 @@ class CLI(object):
>  			org.get("organisation"), org.get("org-name"),
>  		)
>  
> -	def _parse_line(self, line):
> +	def _parse_line(self, line, validcountries = None):
>  		# Skip version line
>  		if line.startswith("2"):
>  			return
> @@ -674,8 +681,15 @@ class CLI(object):
>  			log.warning("Could not parse line: %s" % line)
>  			return
>  
> -		# Skip any lines that are for stats only
> -		if country_code == "*":
> +		# Skip any lines that are for stats only or do not have a country
> +		# code at all (avoids log spam below)
> +		if not country_code or country_code == '*':
> +			return
> +
> +		# Skip objects with unknown country codes
> +		if validcountries and country_code not in validcountries:
> +			log.warning("Skipping line with bogus country '%s': %s" % \
> +				(country_code, line))
>  			return
>  
>  		if type in ("ipv6", "ipv4"):
>
Peter Müller Feb. 4, 2021, 5:32 p.m. UTC | #2
*cough* :-)

> Hello Michael,
> 
> if I got this right, this patch still waits acceptance/rejection, which is why I just
> wanted to bring it up again. :-)
> 
> Thanks, and best regards,
> Peter Müller
> 
>> There is no sense in parsing and storting networks whose country codes
>> cannot be found in the ISO-3166-x country code table. This avoids side
>> effects in applications using the location database, and introduces
>> another sanity check to compensate bogus RIR data.
>>
>> On location02, this affects some networks from APNIC (country code: ZZ)
>> as well as a bunch of smaller allocations within the RIPE region still
>> tagged to CS or YU (Yugoslavia). To my surprise, no network tagged as SU
>> (Soviet Union) was found - while the NIC for .su TLD is still
>> operational. :-)
>>
>> Fixes: #12510
>>
>> Signed-off-by: Peter Müller <peter.mueller@ipfire.org>
>> ---
>>  src/python/location-importer.in | 42 ++++++++++++++++++++++-----------
>>  1 file changed, 28 insertions(+), 14 deletions(-)
>>
>> diff --git a/src/python/location-importer.in b/src/python/location-importer.in
>> index 864eab1..89b556a 100644
>> --- a/src/python/location-importer.in
>> +++ b/src/python/location-importer.in
>> @@ -388,10 +388,17 @@ class CLI(object):
>>  				TRUNCATE TABLE networks;
>>  			""")
>>  
>> +			# Fetch all valid country codes to check parsed networks aganist...
>> +			rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
>> +			validcountries = []
>> +
>> +			for row in rows:
>> +				validcountries.append(row.country_code)
>> +
>>  			for source in location.importer.WHOIS_SOURCES:
>>  				with downloader.request(source, return_blocks=True) as f:
>>  					for block in f:
>> -						self._parse_block(block)
>> +						self._parse_block(block, validcountries)
>>  
>>  			# Process all parsed networks from every RIR we happen to have access to,
>>  			# insert the largest network chunks into the networks table immediately...
>> @@ -467,7 +474,7 @@ class CLI(object):
>>  				# Download data
>>  				with downloader.request(source) as f:
>>  					for line in f:
>> -						self._parse_line(line)
>> +						self._parse_line(line, validcountries)
>>  
>>  	def _check_parsed_network(self, network):
>>  		"""
>> @@ -532,7 +539,7 @@ class CLI(object):
>>  		# be suitable for libloc consumption...
>>  		return True
>>  
>> -	def _parse_block(self, block):
>> +	def _parse_block(self, block, validcountries = None):
>>  		# Get first line to find out what type of block this is
>>  		line = block[0]
>>  
>> @@ -542,7 +549,7 @@ class CLI(object):
>>  
>>  		# inetnum
>>  		if line.startswith("inet6num:") or line.startswith("inetnum:"):
>> -			return self._parse_inetnum_block(block)
>> +			return self._parse_inetnum_block(block, validcountries)
>>  
>>  		# organisation
>>  		elif line.startswith("organisation:"):
>> @@ -573,7 +580,7 @@ class CLI(object):
>>  			autnum.get("asn"), autnum.get("org"),
>>  		)
>>  
>> -	def _parse_inetnum_block(self, block):
>> +	def _parse_inetnum_block(self, block, validcountries = None):
>>  		log.debug("Parsing inetnum block:")
>>  
>>  		inetnum = {}
>> @@ -624,17 +631,17 @@ class CLI(object):
>>  		if not inetnum or not "country" in inetnum:
>>  			return
>>  
>> -		# Skip objects with bogus country code 'ZZ'
>> -		if inetnum.get("country") == "ZZ":
>> -			log.warning("Skipping network with bogus country 'ZZ': %s" % \
>> -				(inetnum.get("inet6num") or inetnum.get("inetnum")))
>> -			return
>> -
>>  		network = ipaddress.ip_network(inetnum.get("inet6num") or inetnum.get("inetnum"), strict=False)
>>  
>>  		if not self._check_parsed_network(network):
>>  			return
>>  
>> +		# Skip objects with unknown country codes
>> +		if validcountries and inetnum.get("country") not in validcountries:
>> +			log.warning("Skipping network with bogus country '%s': %s" % \
>> +				(inetnum.get("country"), inetnum.get("inet6num") or inetnum.get("inetnum")))
>> +			return
>> +
>>  		self.db.execute("INSERT INTO _rirdata(network, country) \
>>  			VALUES(%s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
>>  			"%s" % network, inetnum.get("country"),
>> @@ -659,7 +666,7 @@ class CLI(object):
>>  			org.get("organisation"), org.get("org-name"),
>>  		)
>>  
>> -	def _parse_line(self, line):
>> +	def _parse_line(self, line, validcountries = None):
>>  		# Skip version line
>>  		if line.startswith("2"):
>>  			return
>> @@ -674,8 +681,15 @@ class CLI(object):
>>  			log.warning("Could not parse line: %s" % line)
>>  			return
>>  
>> -		# Skip any lines that are for stats only
>> -		if country_code == "*":
>> +		# Skip any lines that are for stats only or do not have a country
>> +		# code at all (avoids log spam below)
>> +		if not country_code or country_code == '*':
>> +			return
>> +
>> +		# Skip objects with unknown country codes
>> +		if validcountries and country_code not in validcountries:
>> +			log.warning("Skipping line with bogus country '%s': %s" % \
>> +				(country_code, line))
>>  			return
>>  
>>  		if type in ("ipv6", "ipv4"):
>>

Patch

diff --git a/src/python/location-importer.in b/src/python/location-importer.in
index 864eab1..89b556a 100644
--- a/src/python/location-importer.in
+++ b/src/python/location-importer.in
@@ -388,10 +388,17 @@  class CLI(object):
 				TRUNCATE TABLE networks;
 			""")
 
+			# Fetch all valid country codes to check parsed networks aganist...
+			rows = self.db.query("SELECT * FROM countries ORDER BY country_code")
+			validcountries = []
+
+			for row in rows:
+				validcountries.append(row.country_code)
+
 			for source in location.importer.WHOIS_SOURCES:
 				with downloader.request(source, return_blocks=True) as f:
 					for block in f:
-						self._parse_block(block)
+						self._parse_block(block, validcountries)
 
 			# Process all parsed networks from every RIR we happen to have access to,
 			# insert the largest network chunks into the networks table immediately...
@@ -467,7 +474,7 @@  class CLI(object):
 				# Download data
 				with downloader.request(source) as f:
 					for line in f:
-						self._parse_line(line)
+						self._parse_line(line, validcountries)
 
 	def _check_parsed_network(self, network):
 		"""
@@ -532,7 +539,7 @@  class CLI(object):
 		# be suitable for libloc consumption...
 		return True
 
-	def _parse_block(self, block):
+	def _parse_block(self, block, validcountries = None):
 		# Get first line to find out what type of block this is
 		line = block[0]
 
@@ -542,7 +549,7 @@  class CLI(object):
 
 		# inetnum
 		if line.startswith("inet6num:") or line.startswith("inetnum:"):
-			return self._parse_inetnum_block(block)
+			return self._parse_inetnum_block(block, validcountries)
 
 		# organisation
 		elif line.startswith("organisation:"):
@@ -573,7 +580,7 @@  class CLI(object):
 			autnum.get("asn"), autnum.get("org"),
 		)
 
-	def _parse_inetnum_block(self, block):
+	def _parse_inetnum_block(self, block, validcountries = None):
 		log.debug("Parsing inetnum block:")
 
 		inetnum = {}
@@ -624,17 +631,17 @@  class CLI(object):
 		if not inetnum or not "country" in inetnum:
 			return
 
-		# Skip objects with bogus country code 'ZZ'
-		if inetnum.get("country") == "ZZ":
-			log.warning("Skipping network with bogus country 'ZZ': %s" % \
-				(inetnum.get("inet6num") or inetnum.get("inetnum")))
-			return
-
 		network = ipaddress.ip_network(inetnum.get("inet6num") or inetnum.get("inetnum"), strict=False)
 
 		if not self._check_parsed_network(network):
 			return
 
+		# Skip objects with unknown country codes
+		if validcountries and inetnum.get("country") not in validcountries:
+			log.warning("Skipping network with bogus country '%s': %s" % \
+				(inetnum.get("country"), inetnum.get("inet6num") or inetnum.get("inetnum")))
+			return
+
 		self.db.execute("INSERT INTO _rirdata(network, country) \
 			VALUES(%s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
 			"%s" % network, inetnum.get("country"),
@@ -659,7 +666,7 @@  class CLI(object):
 			org.get("organisation"), org.get("org-name"),
 		)
 
-	def _parse_line(self, line):
+	def _parse_line(self, line, validcountries = None):
 		# Skip version line
 		if line.startswith("2"):
 			return
@@ -674,8 +681,15 @@  class CLI(object):
 			log.warning("Could not parse line: %s" % line)
 			return
 
-		# Skip any lines that are for stats only
-		if country_code == "*":
+		# Skip any lines that are for stats only or do not have a country
+		# code at all (avoids log spam below)
+		if not country_code or country_code == '*':
+			return
+
+		# Skip objects with unknown country codes
+		if validcountries and country_code not in validcountries:
+			log.warning("Skipping line with bogus country '%s': %s" % \
+				(country_code, line))
 			return
 
 		if type in ("ipv6", "ipv4"):