Наверх

Парсинг новостей с картинками для сайта

<?php
/**
 * Imports news from a remote listing page into MODX resources.
 *
 * Steps:
 *  1. Load the listing page and collect link, title and date from every `.teaser`.
 *  2. For each teaser that has no matching resource yet (same parent, pagetitle
 *     and publishedon), load the article page and take its `.body` element.
 *  3. Download every image of the article into the local archive directory and
 *     rewrite the `src` attributes to the local copies. The first successfully
 *     downloaded image is also stored in `tv1`.
 *  4. Create the resource through the `resource/create` processor.
 *
 * Expects `$modx` and the MODX_* path constants to be provided by the MODX runtime.
 */
$baseUrl = 'https://site.ru';
$url = $baseUrl . '/type/news';
$parentId = 15;
$imageDir = 'assets/images/archive/';

require_once(MODX_CORE_PATH . 'components/simplehtmldom/simple-html-dom.php');

$listing = file_get_html($url);
if (!$listing) {
    // file_get_html() returns false when the page cannot be loaded.
    return $modx->error->failure('Unable to load ' . $url);
}

$news = [];
foreach ($listing->find('.teaser') as $teaser) {
    $link = $teaser->find('a', 0);
    $date = $teaser->find('.date', 0);
    if (!$link || !$date) {
        // Teaser without a link or a date cannot be imported.
        continue;
    }
    $parsed = date_create_from_format('d.m.y', trim($date->plaintext));
    if (!$parsed) {
        // Date does not match the expected dd.mm.yy format.
        continue;
    }
    $news[] = [
        'link' => $baseUrl . $link->href,
        'publishedon' => date_format($parsed, 'Y-m-d'),
        'pagetitle' => $link->plaintext,
    ];
}

foreach ($news as $page) {
    $page['parent'] = $parentId;

    // Check for an existing resource BEFORE fetching the article, so repeated
    // runs do not download the same images again and leave orphan files.
    $existing = $modx->getObject('modResource', [
        'parent' => $page['parent'],
        'pagetitle' => $page['pagetitle'],
        'publishedon' => strtotime($page['publishedon']),
    ]);
    if ($existing) {
        echo 'Resource found
';
        continue;
    }

    $article = file_get_html($page['link']);
    $content = $article ? $article->find('.body', 0) : null;
    if (!$content) {
        echo $page['pagetitle'] . ' - content not found
';
        continue;
    }

    foreach ($content->find('img') as $img) {
        $src = trim((string) $img->src);
        if ($src === '') {
            continue;
        }

        // Resolve the image address: absolute, protocol-relative or site-relative.
        if (preg_match('#^https?://#i', $src)) {
            $imgUrl = $src;
        } elseif (strpos($src, '//') === 0) {
            $imgUrl = 'https:' . $src;
        } else {
            $imgUrl = $baseUrl . '/' . ltrim($src, '/');
        }

        $data = file_get_contents($imgUrl);
        if ($data === false) {
            // Download failed: keep the original src instead of linking an empty file.
            continue;
        }

        // Take the extension from the URL path only, so a query string
        // (e.g. "?itok=...") does not leak into the file name.
        $extension = pathinfo((string) parse_url($imgUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
        $name = md5(uniqid('', true)) . ($extension !== '' ? '.' . $extension : '');

        if (file_put_contents(MODX_BASE_PATH . $imageDir . $name, $data) === false) {
            continue;
        }

        $img->src = '/' . $imageDir . $name;
        if (empty($page['tv1'])) {
            // Only the first image of the article goes into tv1.
            $page['tv1'] = $img->src;
        }
    }

    // Drop the legacy file name field from the imported markup.
    if ($olds = $content->find('.field-field-old-filename', 0)) {
        $olds->outertext = '';
    }
    $page['content'] = $content->innertext;

    // A resource with the same title already exists under the parent (with a
    // different date): make the alias unique by appending the number of matches.
    $count = $modx->getCount('modResource', [
        'parent' => $page['parent'],
        'pagetitle' => $page['pagetitle'],
    ]);
    if ($count) {
        $page['alias'] = $page['pagetitle'] . '-' . $count;
        print $page['alias'] . ' - ';
    }

    $response = $modx->runProcessor('resource/create', $page);
    if ($response->isError()) {
        echo $page['pagetitle'] . '
';
        return $modx->error->failure($response->getMessage());
    }
    echo 'Resource created
';
}

0 комментариев

    Авторизация

    через сервис Loginza:

    Подписка или RSS

    Буду присылать новые статьи — никакого спама



    Шаблоны MODX

    1 2 Дальше »

    Объектная
    модель
    MODX