DNS와 IP 정보 얻기 (obtaining DNS and IP information for a list of company URLs)
DNS란? (what is DNS?)
IP 정보? (what is IP information?)
nslookup 예시 (nslookup example)
# SCRAP DNS
import numpy as np
import pandas as pd

# Output directory for all intermediate CSV checkpoints of this pipeline.
path = "C:/Users/User/Downloads/2024_상품추천_scrapping/"
# Pipe-delimited source dump; "mbcs" = the Windows ANSI codepage (CP949 on Korean Windows).
file_path = 'C:/Users/User/Downloads/KED/[기업사업부-2024-DB0021][LG유플러스] 2023년 4분기DB CD제공 추출요청/1. 전체자료.txt'
# Only column 0 (KEDCD) and column 10 (URL) are needed downstream.
columns_to_load = [0, 10]
chunk_size = 100000  # rows per chunk

df_chunks = []
# Stream the large file chunk-by-chunk to bound memory use; everything is read as str.
reader = pd.read_csv(file_path, sep='|', usecols=columns_to_load,
                     chunksize=chunk_size, encoding="mbcs", header=None, dtype=str)
# enumerate replaces the original hand-rolled `i = i + 1` counter.
for chunk_no, chunk in enumerate(reader, start=1):
    print(chunk_no * chunk_size)  # progress: upper bound on rows read so far
    # Drop rows missing either column before accumulating.
    df_chunks.append(chunk.dropna())

# Stitch the chunks into one frame and checkpoint it.
data = pd.concat(df_chunks, ignore_index=True)
data.columns = ['KEDCD', 'URL']
data.to_csv(path + '01_urls.csv', index=False)
import dns.resolver
def get_dns_server_name(domain):
    """Return the first NS record for the host embedded in *domain*.

    On any failure the exception text is returned instead of raising, so the
    caller's .apply() never aborts; downstream code filters these error
    strings out by their leading words ("The DNS", "All nameservers", ...).
    """
    try:
        # Strip scheme, path and query string to isolate the hostname.
        # NOTE(review): despite the local name in the original ("tld"), this
        # is the full hostname, not a top-level domain.
        host = domain.split("//")[-1].split("/")[0].split('?')[0]
        ns_records = dns.resolver.resolve(host, 'NS')
        return str(ns_records[0])
    except Exception as exc:
        return str(exc)
#get_dns_server_name(data['URL'][1])
# Time the bulk NS lookup over every URL (network-bound; one query per row).
from datetime import datetime
start_time = datetime.now()
print(f"Start Time: {start_time}")
# Failures store the exception text in the column (see get_dns_server_name).
data['dns_server_name'] = data['URL'].apply(get_dns_server_name)
end_time = datetime.now()
print(f"End Time: {end_time}")
# Checkpoint after the expensive pass.
data.to_csv(path+'02_urls_with_dns.csv', index=False)
import socket
def get_dns_server(url):
    """Resolve the hostname embedded in *url* to an IPv4 address string.

    Returns the exception text on any failure instead of raising, matching
    the error-as-string convention of get_dns_server_name.
    """
    try:
        # Strip scheme, path and query string to isolate the hostname.
        host = url.split("//")[-1].split("/")[0].split('?')[0]
        return socket.gethostbyname(host)
    except Exception as exc:
        return str(exc)
# Smoke-test the resolver on the first URL.
get_dns_server(data['URL'][0])
start_time = datetime.now()
print(f"Start Time: {start_time}")
# Resolve every URL's hostname to an IPv4 address (exception text on failure).
# NOTE(review): despite the column name, this is the web host's own IP
# (socket.gethostbyname), not the IP of a DNS server.
data['dns_server_ip'] = data['URL'].apply(get_dns_server)
end_time = datetime.now()
print(f"End Time: {end_time}")
data.to_csv(path+'03 urls_with_ip.csv', index=False)
import requests
#url = 'https://api.findip.net/164.124.106.136/?token=a1a12781d7ef4a8db32fccc9d5982b53'
#params ={'serviceKey' : '서비스키', 'query' : 'kisa.or.kr', 'answer' : 'xml' }
def get_ip_info(ip):
    """Fetch raw geolocation JSON bytes for *ip* from the findip.net API.

    Returns response.content (bytes) unparsed; parsing happens downstream.
    NOTE(review): verify=False disables TLS certificate checking, and the API
    token is hard-coded in source — both worth revisiting.
    """
    #https://www.findip.net/Main
    endpoint = 'https://api.findip.net/' + ip + '/'
    query = {'token': 'a1a12781d7ef4a8db32fccc9d5982b53'}
    resp = requests.get(endpoint, params=query, verify=False)
    return resp.content
import warnings
# Silence InsecureRequestWarning spam caused by verify=False in get_ip_info.
warnings.filterwarnings("ignore")

start_time = datetime.now()
print(f"Start Time: {start_time}")
#data['ip_info'] = data['URL'].apply(get_ip_info)
data['ip_info'] = ''
# NOTE(review): 171946 looks like a manual resume checkpoint from an earlier
# interrupted run — rows before it keep the empty-string placeholder.
for row_idx in range(171946, len(data)):
    print(row_idx)
    data.loc[row_idx, 'ip_info'] = get_ip_info(data.loc[row_idx, 'dns_server_ip'])
    print(data.loc[row_idx, 'ip_info'])
end_time = datetime.now()
print(f"End Time: {end_time}")
data.to_csv(path+'04 urls_with_ip_info.csv', index=False)
import json
from json.decoder import JSONDecodeError

# Flatten the raw ip_info JSON into typed columns for selected row indices.
# NOTE(review): as written err_list starts empty, so this loop is a no-op
# unless indices are filled in by hand; the commented-out range(...) below
# suggests the full pass was run previously and this is a re-run over the
# rows that failed then.
err_list = []
#for i in range(60784,len(data)):
for i in err_list:
    try:
        tmp = json.loads(data.loc[i, 'ip_info'])
        print(data.loc[i, 'ip_info'])
        #print('City Name:', tmp['city']['names']['en'])
        #print('Continent Code:', tmp['continent']['code'])
        #print('Country Name:', tmp['country']['names']['en'])
        #print('Latitude:', tmp['location']['latitude'])
        #print('Longitude:', tmp['location']['longitude'])
        #print('Time Zone:', tmp['location']['time_zone'])
        #print('Weather Code:', tmp['location']['weather_code'])
        data.loc[i,'city'] = tmp['city']['names']['en']
        data.loc[i, 'continent'] = tmp['continent']['code']
        data.loc[i, 'country'] = tmp['country']['names']['en']
        data.loc[i, 'lat'] = tmp['location']['latitude']
        data.loc[i, 'lon'] = tmp['location']['longitude']
        data.loc[i, 'time_zone'] = tmp['location']['time_zone']
        # Concatenate all English subdivision names into one '/'-separated string.
        subd = ''
        for subdivision in tmp['subdivisions']:
            if 'en' in subdivision['names']:
                #print('Subdivision Name:', subdivision['names']['en'])
                subd = subd + '/' + subdivision['names']['en']
        data.loc[i, 'subdivision'] = subd
        #print('Autonomous System Number:', tmp['traits']['autonomous_system_number'])
        #print('Autonomous System Organization:', tmp['traits']['autonomous_system_organization'])
        #print('Connection Type:', tmp['traits']['connection_type'])
        #print('ISP:', tmp['traits']['isp'])
        #print('User Type:', tmp['traits']['user_type'])
        data.loc[i,'as_number'] = tmp['traits']['autonomous_system_number']
        data.loc[i,'as_org'] = tmp['traits']['autonomous_system_organization']
        data.loc[i,'connection_type'] = tmp['traits']['connection_type']
        data.loc[i,'isp'] = tmp['traits']['isp']
        data.loc[i,'user_type'] = tmp['traits']['user_type']
    except JSONDecodeError as e:
        # Unparseable payload: re-fetch from the API, then append the index
        # back onto err_list — since Python's for loop sees appended items,
        # this effectively retries the row later in the same loop.
        print('Error--')
        data.loc[i, 'ip_info'] = get_ip_info(data.loc[i, 'dns_server_ip'])
        print(data.loc[i, 'ip_info'])
        err_list.append(i)
    except KeyError:
        # Payload parsed but lacks an expected field; row is left partially filled.
        print('Error')
    except TypeError:
        # e.g. ip_info is not a string/bytes at all; skipped.
        print('Error-')
data.to_csv(path+'06 urls_with_ip_info_orgez.csv', index=False)
# Reload the flattened checkpoint; dtype='str' keeps codes/IDs from being
# coerced to numbers (missing cells come back as float NaN, though).
data = pd.read_csv(path+'06 urls_with_ip_info_orgez.csv', dtype='str')
import re
import numpy as np
# Function to extract the registrable (second-level) domain from an NS record.
def extract_domain(row):
    """Extract the registrable domain from an NS-record string like 'ns1.example.com.'.

    Error-message strings stored by get_dns_server_name (starting with
    "The DNS", "All nameservers" or "The resolution") yield np.nan, as does
    any non-string input (fix: NaN cells from the dtype='str' CSV round-trip
    arrive as floats and previously raised TypeError in regex.match).
    Returns the captured domain, or NaN when no pattern matches.
    """
    # Robustness fix: missing values read back from CSV are float NaN, not str.
    if not isinstance(row, str):
        return np.nan
    url_pattern = r'^(?!(The DNS|All nameservers|The resolution))(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    regex = re.compile(url_pattern)
    if regex.match(row):
        # Two-label public suffixes (.co.kr etc.) need three labels captured.
        if row.endswith(('.co.kr.', '.or.kr.', '.go.kr.', '.co.uk.', '.ne.kr.')):
            match = pd.Series(row).str.extract(r'([a-zA-Z0-9-]+\.[a-zA-Z0-9-]+\.[a-zA-Z]{2,})\.$')
        else:
            match = pd.Series(row).str.extract(r'([a-zA-Z0-9-]+\.[a-zA-Z]{2,})\.$')
        # extract() yields NaN here when the suffix pattern did not match.
        return match.iloc[0, 0]
    else:
        return np.nan
# Apply the second-level-domain extraction to every NS-record string and checkpoint.
data['extracted_domain'] = data['dns_server_name'].apply(extract_domain)
data.to_csv(path+'07 url_with_ip_info_orgez_dns_2nd_level.csv', index=False)
data.head()
# Drop the bulky raw JSON column before writing the final slim output.
data2 = data.drop('ip_info', axis=1)
data2.to_csv(path+'08 url_with_ip_info_orgez_dns_2nd_level_wo_ip_info.csv', index=False)
Discussion