깐죽이의 정보 바다

본격적으로 네이버 블로그를 수집하여 게시판에 등록하는 작업을 시작해보자.

 

작업순서

디비 생성 > 네이버 목록 코드 수집 > 네이버 크롤링 후 게시판 등록

 

1. 디비생성

데이터베이스에 글을 어디까지 수집을 했는지 저장해주는 테이블을 생성해주자

 

테이블명 : last_num

필드설명

idx - 고유값

last_num - 최종값 저장

site_name - 사이트명 저장

-- Crawl-progress table: stores how far collection has progressed.
--   idx       : surrogate primary key
--   last_num  : last collected code/position
--   site_name : name of the source site the position belongs to
-- IF NOT EXISTS makes the script safe to re-run on an already-initialised database.
-- NOTE(review): last_num is a 32-bit INT; if it is ever meant to hold Naver logNo
-- values (12 digits in the sample response) it would need BIGINT — confirm intent.
CREATE TABLE IF NOT EXISTS `last_num` (
  `idx` int(2) NOT NULL AUTO_INCREMENT COMMENT '고유값',
  `last_num` int(11) NOT NULL DEFAULT 0 COMMENT '마지막코드',
  `site_name` char(100) NOT NULL DEFAULT '' COMMENT '사이트명',
  PRIMARY KEY (`idx`) COMMENT '프라이머리키'
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

 

2. 네이버 목록 코드 수집

네이버 목록은 json 형태의 데이터로 목록 리스트를 가져온다. 

 

목록필드 설명

주소 : https://blog.naver.com/PostTitleListAsync.naver?blogId=blk_lg_ryan&viewdate=&currentPage=52&categoryNo=0&parentCategoryNo=0&countPerPage=30 

 

blogId - 블로그 아이디

currentPage = 현재 페이지 번호

countPerPage = 한 페이지에 표시할 글 개수

{"resultCode":"S","resultMessage":"","postList":[{"sellerServiceStatus":"","logNo":"221980384074","title":"%5B%EC%9A%94%EB%BD%80%EB%81%BC+%EB%A7%A4%EC%BD%A4%EB%8B%AC%EC%BD%A4+%EB%96%A1%EB%B3%B6%EC%9D%B4%5D+%ED%95%9C%EA%B5%AD%EC%9D%8C%EC%8B%9D%EC%9D%B4+%EC%99%B8%EA%B5%AD%EC%9D%B8%EC%9D%98+%EC%9E%85%EB%A7%9B%EC%97%90+%EB%A7%9E%EB%8B%A4%EB%A9%B4%2C+%ED%95%9C%EA%B5%AD%EC%82%AC%EB%9E%8C%EC%97%90%EA%B2%90+%EC%95%88+%EB%A7%9E%EC%9D%84+%EC%88%98+%EC%9E%88%EC%96%B4%EC%9A%94%21","categoryNo":"38","parentCategoryNo":"5","sourceCode":"0","commentCount":"14","readCount":"","addDate":"2020. 5. 28.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221980953149","title":"%5B%ED%94%BC%EC%BD%94%ED%81%AC+%EB%82%98%EA%B3%A0%EC%95%BC%EC%8B%9D+%EC%B9%98%ED%82%A8%EC%9C%99%5D+%EC%B9%98%ED%82%A8%EC%9C%99%EC%97%90+%EB%8C%80%ED%95%9C+%EA%B3%A0%EC%A0%95%EA%B4%80%EB%85%90%EC%9D%84+%EC%9E%A0%EC%8B%9C+%EB%82%B4%EB%A0%A4%EB%86%93%EC%95%84%EB%8F%84+%EC%A2%8B%EC%95%84%EC%9A%94%21","categoryNo":"36","parentCategoryNo":"5","sourceCode":"0","commentCount":"25","readCount":"","addDate":"2020. 5. 27.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221980554739","title":"%5B%EB%82%AE%EB%8F%84%EB%B0%A4%EC%95%88+%EB%B0%98%EB%B0%98%EC%82%BC%EA%B2%B9%EC%82%B4%5D+%EB%84%A4%EA%B0%80+%EB%8F%84%EC%8B%9C%EB%9D%BD%EC%9D%B4%EB%93%A0+%EC%95%88%EC%A3%BC%EB%93%A0+%EC%83%81%EA%B4%80%EC%97%86%EC%96%B4%21+%EB%A7%9B%EC%9E%88%EC%9C%BC%EB%8B%88%EA%B9%8C%21","categoryNo":"55","parentCategoryNo":"5","sourceCode":"0","commentCount":"27","readCount":"","addDate":"2020. 5. 
27.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221980142921","title":"%5B%ED%8F%AC%ED%82%A4+%EC%B2%B4%EB%A6%AC%EB%B8%94%EB%9D%BC%EC%8D%B8%5D+%EB%B2%9A%EA%BD%83%EB%A7%9B%EC%9D%B4+%EA%B3%A0%EA%B5%AC%EB%A7%88%EB%A7%9B%ED%95%98%EA%B3%A0+%EB%B9%84%EC%8A%B7%ED%95%9C%EA%B0%80%EC%9A%94%3F","categoryNo":"38","parentCategoryNo":"5","sourceCode":"0","commentCount":"10","readCount":"","addDate":"2020. 5. 27.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221979816512","title":"%5B%EC%B9%B8%ED%83%80%ED%83%80+%EB%95%85%EC%BD%A9%ED%81%AC%EB%A6%BC%EB%9D%BC%EB%96%BC%5D+%EB%92%B7%EB%B6%81%EC%9D%B4%EB%9D%BC%EB%8F%84+%EC%A2%8B%EC%95%84%EC%9A%94.+%EC%A0%9C%EC%A3%BC%EB%8F%84+%EC%B9%B4%ED%8E%98+%ED%96%A5%EA%B8%B0%EB%A5%BC+%EC%9E%A0%EC%8B%9C%EB%82%98%EB%A7%88+%EB%8A%90%EB%82%84+%EC%88%98+%EC%9E%88%EC%96%B4%EC%84%9C.","categoryNo":"38","parentCategoryNo":"5","sourceCode":"0","commentCount":"20","readCount":"","addDate":"2020. 5. 26.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221979347585","title":"%5B%EB%8F%84%EC%82%B0%EA%B3%B5%EC%9B%90+%ED%85%8C%EB%8B%88%EC%8A%A4%EC%BD%94%ED%8A%B8+%EC%BB%A8%EC%85%89+%ED%8F%B4%ED%8A%B8%EB%B2%84%EA%B1%B0%5D+%EB%A7%A4%EC%BD%A4%ED%95%9C+%EB%B6%88%EB%A7%9B+%ED%8C%A8%ED%8B%B0%EA%B0%80+%EC%97%B4%EC%9D%BC%ED%96%88%EB%84%A4%21","categoryNo":"61","parentCategoryNo":"59","sourceCode":"0","commentCount":"19","readCount":"","addDate":"2020. 5. 
26.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221978912711","title":"%5BMiSURA+%EC%98%81%EC%96%91%EB%B0%94%5D+%EC%95%84%EC%9D%B4%EB%93%A4+%EC%98%81%EC%96%91%EA%B0%84%EC%8B%9D%EC%9C%BC%EB%A1%9C+%EB%AF%B8%EC%A3%BC%EB%9D%BC%EB%8A%94+%EC%95%84%EC%A3%BC%EB%9D%BC%7E%21","categoryNo":"55","parentCategoryNo":"5","sourceCode":"0","commentCount":"19","readCount":"","addDate":"2020. 5. 26.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221978487711","title":"%5B%ED%92%80%EB%AC%B4%EC%9B%90+%EC%98%81%EC%96%91%EC%82%BC%EA%B3%84%ED%83%95%5D+%EB%8F%8C%EC%9D%84+%EC%94%B9%EC%96%B4%EB%A8%B9%EB%8A%94+%EB%82%98%EC%9D%B4%EC%9D%98+%EC%95%A0%EB%93%A4%EB%8F%84+%EB%BC%88%EB%8A%94+%EC%94%B9%EC%96%B4%EB%A8%B9%EA%B8%B0+%EC%8B%AB%EB%8D%B0%EC%9A%94%21+%EB%84%88%EB%AC%B4+%ED%91%B9+%EA%B3%A0%EC%95%98%EB%84%A4%EC%9A%94.","categoryNo":"55","parentCategoryNo":"5","sourceCode":"0","commentCount":"24","readCount":"","addDate":"2020. 5. 25.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221977759073","title":"%5B%ED%8A%B8%EB%A0%88%EC%9D%B4%EB%8D%94%EC%8A%A4+%EB%BC%88%EC%97%86%EB%8A%94+%EC%88%9C%EC%82%B4%EC%A1%B1%EB%B0%9C%5D+%EA%B0%80%EC%84%B1%EB%B9%84+%EC%A1%B1%EB%B0%9C%EC%9D%84+%EC%B0%BE%EB%8A%94%EB%8B%A4%EB%A9%B4+%EC%9A%94%EA%B1%B0%EC%97%90%EC%9A%94.+%EC%9A%94%EA%B1%B0%21","categoryNo":"38","parentCategoryNo":"5","sourceCode":"0","commentCount":"24","readCount":"","addDate":"2020. 5. 
25.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221977674917","title":"%5B%EC%BD%94%EC%8A%A4%ED%8A%B8%EC%BD%94+%EC%83%81%EB%B4%89%EC%A0%90%5D+5%EC%9B%9425%EC%9D%BC%7E5%EC%9B%9431%EC%9D%BC+%ED%95%A0%EC%9D%B8%EC%A0%95%EB%B3%B4+%EB%B3%B4%EA%B3%A0%EA%B0%80%EC%84%B8%EC%9A%94.+%28%EC%8B%9D%ED%92%88%EB%A5%98%29","categoryNo":"60","parentCategoryNo":"59","sourceCode":"0","commentCount":"2","readCount":"","addDate":"2020. 5. 25.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlockTmpForced":0,"postProductStatus":""},{"sellerServiceStatus":"","logNo":"221977625431","title":"%5B%EC%BD%94%EC%8A%A4%ED%8A%B8%EC%BD%94+%EC%83%81%EB%B4%89%EC%A0%90%5D+5%EC%9B%9425%EC%9D%BC%7E6%EC%9B%947%EC%9D%BC+%ED%95%A0%EC%9D%B8%EC%A0%95%EB%B3%B4+%EC%95%8C%EB%A0%A4%EB%93%9C%EB%A0%A4%EC%9A%94%21+%28%EC%8B%9D%ED%92%88%EB%A5%98+%EC%A0%9C%EC%99%B8%29","categoryNo":"60","parentCategoryNo":"59","sourceCode":"0","commentCount":"2","readCount":"","addDate":"2020. 5. 25.","openType":"2","searchYn":"true","greenReviewBannerYn":false,"memologMovingYn":"0","isPostSelectable":"0","isPostNotOpen":"0","isPostBlocked":0,"isBlo

 

3. 크롤링 PHP 페이지 제작

클래스 Imagelib2  그누보드 내장 클래스를 사용했다. 

라이브러리: curl - 페이지를 가져오는 데 필요한 라이브러리다. 

 

상단부 데이터베이스를 가져와 저장하는 루틴

// Load the stored crawl position from tracker row idx = 1.
// idx is the primary key, so at most one row matches and no ORDER BY is needed.
$sql = "select last_num from last_num WHERE idx = 1 limit 1";
$row = sql_fetch($sql);

// Step the position back by one. The int cast guarantees that only a number is
// concatenated into the UPDATE below; a missing row counts as 0 (result -1),
// which matches the previous behaviour without raising an offset notice.
$idx = (isset($row['last_num']) ? (int) $row['last_num'] : 0) - 1;

// Persist the new position, scoped to the same tracker row.
$sql = " UPDATE last_num SET last_num = ".$idx." WHERE idx = 1";
sql_query($sql);

 

게시글 저장하는 함수

/**
 * Insert one new top-level post into a board on behalf of an existing member.
 *
 * Keys read from $newpost:
 *   - bo_table   (required) board id; letters, digits and underscore only
 *   - mb_id      (required) id of the member recorded as the author
 *   - wr_subject (required) post title
 *   - wr_content (required) post body
 *   - ca_name    (optional) category; appended to the board's category list if new
 *   - wr_link1, wr_link2, wr_1 .. wr_10 (optional) extra columns, default ''
 *
 * Besides the post row itself, the function sets wr_parent to the new wr_id,
 * adds a row to the "new posts" table and increments the board's post counter.
 * The board's write-level permission is NOT checked here.
 *
 * @param array $newpost Post data keyed as described above.
 * @return bool TRUE when the post was stored; FALSE when the board or member is
 *              unknown, the title or body is empty, or the insert produced no id.
 */
function insert_write($newpost)
{
    global $g5;

    // --- Board lookup -------------------------------------------------------
    // bo_table ends up inside a table name, where quoting/escaping cannot help,
    // so only plain identifier characters are accepted.
    $bo_table = isset($newpost['bo_table']) ? (string) $newpost['bo_table'] : '';
    if (!preg_match('/\A[A-Za-z0-9_]+\z/', $bo_table)) return FALSE; // bo_table missing or invalid
    $board = sql_fetch(" select * from {$g5['board_table']} where bo_table = '$bo_table' ");
    if (!$board) return FALSE; // board does not exist

    // --- Member lookup ------------------------------------------------------
    $member = get_member(isset($newpost['mb_id']) ? $newpost['mb_id'] : '');
    if (!$member) return FALSE; // member does not exist

    // --- Category -----------------------------------------------------------
    // Compare against the exact '|'-separated entries; a substring test would
    // treat "food" as already present when only "seafood" is listed.
    $ca_name = isset($newpost['ca_name']) ? (string) $newpost['ca_name'] : '';
    if ($ca_name) {
        $current_list = (string) $board['bo_category_list'];
        $categories = ($current_list !== '') ? explode('|', $current_list) : array();
        if (!in_array($ca_name, $categories, true)) {
            $categories[] = $ca_name;
            $category_list = addslashes(implode('|', $categories));
            sql_query(" update {$g5['board_table']} set bo_category_list = '$category_list' where bo_table = '$bo_table' ");
        }
    }

    // --- Post fields --------------------------------------------------------
    $wr_subject = addslashes(trim(isset($newpost['wr_subject']) ? $newpost['wr_subject'] : ''));
    $wr_content = addslashes(trim(isset($newpost['wr_content']) ? $newpost['wr_content'] : ''));
    if ($wr_subject === '') return FALSE; // title is empty
    if ($wr_content === '') return FALSE; // body is empty

    $write_table = $g5['write_prefix'].$bo_table;
    $wr_num = get_next_num($write_table);
    $ca_name = addslashes($ca_name);
    $wr_option = 'html1,,'; // html flag set; secret and mail flags left empty
    $wr_link1 = addslashes(isset($newpost['wr_link1']) ? $newpost['wr_link1'] : '');
    $wr_link2 = addslashes(isset($newpost['wr_link2']) ? $newpost['wr_link2'] : '');

    // Author columns are copied from the member row; escape them because stored
    // values (e.g. a name containing an apostrophe) would otherwise break the SQL.
    $mb_id = addslashes($member['mb_id']);
    $wr_password = addslashes($member['mb_password']);
    $wr_name = addslashes($board['bo_use_name'] ? $member['mb_name'] : $member['mb_nick']);
    $wr_email = addslashes($member['mb_email']);
    $wr_homepage = addslashes($member['mb_homepage']);

    // REMOTE_ADDR is not set when the crawler runs from the command line.
    $wr_ip = addslashes(isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '');

    // Extra columns wr_1 .. wr_10: escaped when supplied, empty string otherwise.
    $wr_extra = '';
    for ($i = 1; $i <= 10; $i++) {
        $key = "wr_{$i}";
        $value = isset($newpost[$key]) ? addslashes($newpost[$key]) : '';
        $wr_extra .= ",
                     {$key} = '{$value}'";
    }

    // --- Insert the post ----------------------------------------------------
    $sql = " insert into $write_table
                set wr_num = '$wr_num',
                     wr_reply = '',
                     wr_comment = 0,
                     ca_name = '$ca_name',
                     wr_option = '$wr_option',
                     wr_subject = '$wr_subject',
                     wr_content = '$wr_content',
                     wr_link1 = '$wr_link1',
                     wr_link2 = '$wr_link2',
                     wr_link1_hit = 0,
                     wr_link2_hit = 0,
                     wr_hit = 0,
                     wr_good = 0,
                     wr_nogood = 0,
                     mb_id = '$mb_id',
                     wr_password = '$wr_password',
                     wr_name = '$wr_name',
                     wr_email = '$wr_email',
                     wr_homepage = '$wr_homepage',
                     wr_datetime = '".G5_TIME_YMDHIS."',
                     wr_last = '".G5_TIME_YMDHIS."',
                     wr_ip = '$wr_ip'{$wr_extra} ";
    sql_query($sql);

    // Without a generated id the insert did not happen; stop before touching
    // the counters so they cannot drift from the real post count.
    $wr_id = (int) sql_insert_id();
    if ($wr_id < 1) return FALSE;

    // A top-level post is its own parent.
    sql_query(" update $write_table set wr_parent = '$wr_id' where wr_id = '$wr_id' ");
    // Register the post in the "new posts" table.
    sql_query(" insert into {$g5['board_new_table']} ( bo_table, wr_id, wr_parent, bn_datetime, mb_id ) values ( '{$bo_table}', '{$wr_id}', '{$wr_id}', '".G5_TIME_YMDHIS."', '$mb_id' ) ");
    // Bump the board's post counter.
    sql_query(" update {$g5['board_table']} set bo_count_write = bo_count_write + 1 where bo_table = '{$bo_table}' ");

    return TRUE;
}

 

이미지를 가져와 서버에 저장하는 클래스

<?php
/**
 * image 를 관리하는 class 입니다.
 */

// header('Content-Type: image/jpeg');
class Imagelib2
{

	var $uploads_dir = '/home/www/damuk/html/data';

    /**
     * 본문 내용중 외부 이미지주소를 서버로 가져온 후에 내부 주소로 변경합니다
     */
	public function Imagelib2() {

	}

    public function replace_external_image($content = '')
    {
        if (empty($content)) {
            return;
        }

        $patten = "/<img[^>]*src=[\"']?([^>\"']+)[\"']?[^>]*>/i";

        preg_match_all($patten, $content, $match);

        if (isset($match[1]) && $match[1]) {
            foreach ($match[1] as $link) {

 

제목과 카테고리 추출

// Extract the post title from the <title> element of the fetched page.
preg_match_all('/<title>(.*?)<\/title>/is',$html, $matches);
// Use an empty title when the page has no <title> element instead of reading
// an undefined offset.
$title = isset($matches[1][0]) ? $matches[1][0] : '';
// Remove the blog-service suffix and restore apostrophes encoded as &#39;.
// Both are literal strings, so a plain string replace is sufficient.
$title = str_replace(': 네이버 블로그', '', $title);
$title = str_replace('&#39;', '\'', $title);

// Extract the category name from the inline assignment: categoryName = "...";
preg_match_all('/categoryName = \"(.*?)\"\;/is',$html, $matches);
// The previous code passed the match through preg_replace('//', '', ...), which
// leaves the string unchanged, so the captured value is used directly.
// NOTE(review): that empty pattern may have lost its content when the article
// was published — confirm whether some character was meant to be stripped.
$cate1 = isset($matches[1][0]) ? $matches[1][0] : '';

 

반응형

공유하기

facebook twitter kakaoTalk kakaostory naver band shouturl