import sys
from PyQt5.QtWidgets import QApplication, QMainWindow, QLineEdit, QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QMessageBox, QFileDialog
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage
import requests
from bs4 import BeautifulSoup
import os
import zipfile
from urllib.parse import urljoin, urlparse
class CustomWebEnginePage(QWebEnginePage):
    """Web page whose handling of TLS certificate errors is configurable.

    SECURITY: with ``ignore_certificate_errors`` left at its default of
    ``True`` every certificate error is accepted, so pages served with an
    invalid, expired or self-signed certificate load without any warning.
    Set the attribute to ``False`` (on the class or on an instance) to make
    such loads fail instead.
    """

    # Value returned from certificateError(); True keeps the original
    # "accept everything" behaviour of this class.
    ignore_certificate_errors = True

    def certificateError(self, certificate):
        """Tell Qt whether to continue loading despite a certificate error.

        Args:
            certificate: The certificate error object supplied by Qt; it is
                not inspected.

        Returns:
            ``True`` to ignore the error and continue loading, ``False`` to
            reject the load.
        """
        return bool(self.ignore_certificate_errors)
class MainWindow(QMainWindow):
    """Minimal browser window that can save the current page as a zip archive.

    The archive contains ``<netloc>/index.html`` plus every same-host
    resource referenced from ``<link href>``, ``<script src>``, ``<img src>``
    and ``<source srcset>``; links to the downloaded resources are rewritten
    so that they resolve relative to ``index.html`` inside the archive.
    """

    # Tags carrying a single resource URL, mapped to the attribute holding it.
    # <source srcset> may carry several URLs and is handled separately.
    _URL_ATTRS = {'link': 'href', 'script': 'src', 'img': 'src'}
    _RESOURCE_TAGS = ['link', 'script', 'img', 'source']
    # Seconds to wait for each resource before giving up on it.
    _DOWNLOAD_TIMEOUT = 15

    def __init__(self):
        """Build the navigation bar, the web view and the copy button."""
        super().__init__()
        self.setWindowTitle("Браузер")
        self.setGeometry(100, 100, 800, 600)
        self.central_widget = QWidget()
        self.setCentralWidget(self.central_widget)
        self.main_layout = QVBoxLayout(self.central_widget)

        # Navigation bar: back, forward, address bar, go.
        navigation_layout = QHBoxLayout()
        self.back_button = QPushButton("←")
        navigation_layout.addWidget(self.back_button)
        self.forward_button = QPushButton("→")
        navigation_layout.addWidget(self.forward_button)
        self.url_bar = QLineEdit()
        navigation_layout.addWidget(self.url_bar)
        self.url_bar.returnPressed.connect(self.load_url)
        self.go_button = QPushButton("Перейти")
        # The button must trigger navigation just like pressing Enter does.
        self.go_button.clicked.connect(self.load_url)
        navigation_layout.addWidget(self.go_button)
        self.main_layout.addLayout(navigation_layout)

        self.browser = QWebEngineView()
        # setPage() does not take ownership of the page, so parent it to the
        # view and keep a reference; otherwise it may be garbage-collected
        # while the view is still using it.
        self.web_page = CustomWebEnginePage(self.browser)
        self.browser.setPage(self.web_page)
        self.main_layout.addWidget(self.browser)
        self.browser.urlChanged.connect(self.update_url_bar)

        # The navigation buttons can only be wired once the view exists.
        self.back_button.clicked.connect(self.browser.back)
        self.forward_button.clicked.connect(self.browser.forward)

        # "Copy website" button.
        self.copy_button = QPushButton("Копировать сайт")
        self.copy_button.clicked.connect(self.copy_website)
        self.main_layout.addWidget(self.copy_button)

        self.current_url = ""

    def update_url_bar(self, url):
        """Mirror the view's current URL into the address bar and state."""
        text = url.toString()
        self.url_bar.setText(text)
        self.current_url = text

    def load_url(self):
        """Navigate to the address typed into the address bar.

        Input without a scheme (e.g. ``example.com``) is accepted; an empty
        or unparsable address shows a warning instead of navigating.
        """
        url = QUrl.fromUserInput(self.url_bar.text().strip())
        if not url.isValid():
            QMessageBox.warning(self, "Предупреждение", "Пожалуйста, введите URL.")
            return
        self.current_url = url.toString()
        self.browser.load(url)

    def browser_back(self):
        """Go one step back in the view's history."""
        self.browser.back()

    def browser_forward(self):
        """Go one step forward in the view's history."""
        self.browser.forward()

    def copy_website(self):
        """Ask for a target folder and archive the current page into it.

        The HTML is fetched asynchronously from the page; the archive itself
        is produced by :meth:`process_html` once the HTML is available.
        """
        if not self.current_url:
            QMessageBox.warning(self, "Предупреждение", "Пожалуйста, сначала перейдите на сайт.")
            return
        options = QFileDialog.Options()
        download_dir = QFileDialog.getExistingDirectory(
            self, "Выберите папку для сохранения", "", options=options)
        if not download_dir:
            return

        # Snapshot the URL now: the callback below runs later, and by then
        # the user may already have navigated elsewhere.
        base_url = self.current_url
        # A netloc may contain ':' (port) or '@', which are not valid in
        # file names on every platform.
        base_name = self._safe_file_name(urlparse(base_url).netloc) or "copied_website"
        zip_filename = os.path.join(download_dir, f"{base_name}.zip")
        QMessageBox.information(
            self, "Копирование",
            f"Начинается копирование сайта: {base_url} в архив: {zip_filename}")
        try:
            self.browser.page().toHtml(
                lambda html: self.process_html(html, base_url, zip_filename))
        except Exception as e:  # GUI boundary: report instead of crashing.
            QMessageBox.critical(self, "Ошибка при получении HTML", str(e))
            print(f"Произошла ошибка при получении HTML: {e}")

    def process_html(self, html_content, base_url, zip_filename):
        """Download the page's same-host resources and write the zip archive.

        Args:
            html_content: HTML source of the page as a string.
            base_url: URL the page was loaded from; relative resource URLs
                are resolved against it.
            zip_filename: Path of the archive to create (overwritten).

        Resources that fail to download are logged and skipped. Errors while
        writing the archive are reported in a message box.

        NOTE: downloads run synchronously, so the window does not respond
        until they finish.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        base_netloc = urlparse(base_url).netloc

        # absolute URL -> path relative to index.html. A dict keeps document
        # order and drops repeated references to the same resource.
        candidates = {}
        for raw_url in self._iter_resource_urls(soup):
            absolute_url = urljoin(base_url, raw_url)
            parsed = urlparse(absolute_url)
            # Only same-host http(s) resources can be fetched and archived;
            # this also filters out data:, mailto: and javascript: URLs.
            if parsed.scheme not in ('http', 'https') or parsed.netloc != base_netloc:
                continue
            relative_path = parsed.path.lstrip('/')
            if (not relative_path or relative_path.endswith('/')
                    or relative_path == 'index.html'
                    or '..' in relative_path.split('/')):
                # No file name, would overwrite the page itself, or would
                # escape the archive folder.
                continue
            candidates.setdefault(absolute_url, relative_path)

        downloaded = {}
        written = set()
        try:
            with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
                with requests.Session() as session:
                    for absolute_url, relative_path in candidates.items():
                        if relative_path in written:
                            # Same path reached through another URL (e.g. a
                            # different query string): reuse the stored file.
                            downloaded[absolute_url] = relative_path
                            continue
                        try:
                            response = session.get(absolute_url, timeout=self._DOWNLOAD_TIMEOUT)
                            response.raise_for_status()
                        except requests.exceptions.RequestException as e:
                            print(f"Ошибка при скачивании {absolute_url}: {e}")
                            continue
                        archive_path = self._archive_name(base_netloc, relative_path)
                        zf.writestr(archive_path, response.content)
                        written.add(relative_path)
                        downloaded[absolute_url] = relative_path
                        print(f"Сохранен ресурс: {absolute_url} -> {archive_path}")

                if downloaded:
                    self._rewrite_links(soup, base_url, downloaded)
                    page_html = str(soup)
                else:
                    # Nothing to rewrite: keep the page byte-for-byte.
                    page_html = html_content
                # Written exactly once, so the archive has no duplicate entry.
                zf.writestr(self._archive_name(base_netloc, 'index.html'),
                            page_html.encode('utf-8'))
        except (OSError, zipfile.LargeZipFile) as e:
            QMessageBox.critical(self, "Ошибка при сохранении архива", str(e))
            print(f"Ошибка при сохранении архива {zip_filename}: {e}")
            return
        QMessageBox.information(
            self, "Копирование",
            f"Копирование завершено. Сайт сохранен в: {zip_filename}")

    def update_html_links(self, html_content, original_url, new_path, base_url=None):
        """Return *html_content* with links to one resource made local.

        Args:
            html_content: HTML source as a string.
            original_url: Resource URL as it appears in the page (relative
                or absolute).
            new_path: Archive path of the downloaded resource, including the
                leading ``<netloc>/`` folder.
            base_url: URL the page was loaded from; defaults to
                ``self.current_url``.

        Returns:
            The HTML with every matching ``href``/``src``/``srcset`` entry
            replaced by the path of the resource relative to ``index.html``.
        """
        base_url = base_url or self.current_url
        # index.html lives in the <netloc> folder, so links must be relative
        # to that folder whether the original URL was root-relative or not.
        html_folder = urlparse(base_url).netloc or os.curdir
        relative_link = os.path.relpath(new_path, html_folder).replace('\\', '/')
        soup = BeautifulSoup(html_content, 'html.parser')
        self._rewrite_links(soup, base_url, {urljoin(base_url, original_url): relative_link})
        return str(soup)

    @staticmethod
    def _safe_file_name(name):
        """Replace characters that are unsafe in file names with ``_``."""
        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)

    @staticmethod
    def _archive_name(folder, relative_path):
        """Join an archive folder and a relative path using ``/``."""
        return f"{folder}/{relative_path}" if folder else relative_path

    @staticmethod
    def _parse_srcset(value):
        """Split a ``srcset`` value into ``(url, descriptor)`` pairs.

        Empty entries (e.g. after a trailing comma) are dropped. URLs that
        themselves contain commas are not supported by this simple split.
        """
        pairs = []
        for entry in value.split(','):
            parts = entry.split()
            if parts:
                pairs.append((parts[0], ' '.join(parts[1:])))
        return pairs

    @classmethod
    def _iter_resource_urls(cls, soup):
        """Yield every resource URL referenced by the supported tags."""
        for tag in soup.find_all(cls._RESOURCE_TAGS):
            attr = cls._URL_ATTRS.get(tag.name)
            if attr is not None:
                if tag.has_attr(attr):
                    yield tag[attr]
            elif tag.has_attr('srcset'):
                for url, _descriptor in cls._parse_srcset(tag['srcset']):
                    yield url

    @classmethod
    def _rewrite_links(cls, soup, base_url, replacements):
        """Rewrite resource links in *soup* in place.

        ``replacements`` maps absolute resource URLs to the link text that
        should replace them; links that are not in the mapping are kept.
        """
        for tag in soup.find_all(cls._RESOURCE_TAGS):
            attr = cls._URL_ATTRS.get(tag.name)
            if attr is not None:
                if not tag.has_attr(attr):
                    continue
                original = tag[attr]
                replacement = replacements.get(urljoin(base_url, original))
                if replacement is None:
                    continue
                tag[attr] = replacement
                if tag.name == 'link':
                    print(f"Обновлена CSS ссылка: {original} -> {replacement}")
            elif tag.has_attr('srcset'):
                rebuilt = []
                for url, descriptor in cls._parse_srcset(tag['srcset']):
                    target = replacements.get(urljoin(base_url, url), url)
                    rebuilt.append(f"{target} {descriptor}" if descriptor else target)
                tag['srcset'] = ', '.join(rebuilt)
if __name__ == '__main__':
    # Script entry point: create the Qt application, show the browser
    # window, run the event loop and exit with its return code.
    application = QApplication(sys.argv)
    window = MainWindow()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)